def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters.

    The medium is resolved through Medium.get_or_create(source).
    NOTE(review): an earlier revision of this docstring stated that
    Medium.DoesNotExist is raised when the source has no database entry;
    with get_or_create that is presumably no longer true -- confirm.

    @param headline: headline of new Article-object
    @type headline: str

    @param byline: byline for new Article
    @type byline: NoneType, str

    @param text: text for new Article
    @type text: str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. The keys 'author', 'section',
                 'length' and 'url' are extracted and set on the article;
                 everything else ends up in the metastring. The given
                 dictionary itself is not modified.
    @type meta: dictionary

    @return Article-object
    """
    log.debug(
        "Creating article object for {headline!r}".format(headline=headline))

    art = Article(headline=headline, byline=byline, text=text, date=date)
    art.medium = Medium.get_or_create(source)

    # Author / Section
    # Pop from a copy so the caller's dictionary is left untouched.
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)

    if 'length' in meta:
        # Only the first whitespace-separated token is parsed as a number.
        art.length = int(meta.pop('length').split()[0])
    else:
        # No explicit length given: fall back to the number of spaces.
        art.length = art.text.count(" ")

    if 'url' in meta:
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # All whitespace is removed from the url.
        art.url = re.sub(r"\s+", "", meta.pop('url'))

    # Whatever was not extracted above is stored as the metastring.
    art.metastring = str(meta)
    return art
def body_to_article(self, headline, byline, text, date, source, meta):
    """
    Build an Article-object from the given parameters and assign it the
    project found in self.options['project'].

    The medium is resolved through get_or_create(Medium, name=source).
    NOTE(review): an earlier revision of this docstring stated that
    Medium.DoesNotExist is raised for unknown sources; given the
    get_or_create call that is presumably outdated -- confirm.

    @param headline: headline of new Article-object
    @type headline: unicode / str

    @param byline: byline for new Article
    @type byline: NoneType, unicode, str

    @param text: text for new Article
    @type text: unicode / str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: unicode / str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. The keys 'author', 'section'
                 and 'length' are extracted and set on the article; the
                 rest is stored in the metastring. The given dictionary
                 itself is not modified.
    @type meta: dictionary

    @return Article-object
    """
    message = "Creating article object for {headline!r}"
    log.debug(message.format(headline=headline))

    article = Article(headline=headline, byline=byline, text=text, date=date)
    article.medium = get_or_create(Medium, name=source)

    # Author / Section are taken out of a copy of the meta dictionary.
    leftover = meta.copy()
    article.author = leftover.pop('author', None)
    article.section = leftover.pop('section', None)

    # Use the first token of an explicit 'length' entry when present,
    # otherwise fall back to the number of spaces in the text.
    article.length = (
        int(leftover.pop('length').split()[0])
        if 'length' in leftover
        else article.text.count(" ")
    )

    article.metastring = str(leftover)
    article.project = self.options['project']
    return article
def create_article(self):
    """
    Convert the document object into an article.

    Every property returned by self.getprops() is passed through
    self._convert(). Properties whose name is listed in _ARTICLE_PROPS are
    set as attributes on a new Article; all others are collected in a
    dictionary whose string representation becomes Article.metastring.

    The new article is also stored on self.article.

    @return Article-object
    """
    article = Article()
    leftovers = {}

    for name, raw in self.getprops().items():
        converted = self._convert(raw)
        if prop_is_meta := False:  # placeholder never executed
            pass
        if name not in _ARTICLE_PROPS:
            # Not a known article property: keep it for the metastring.
            leftovers[name] = converted
            continue
        setattr(article, name, converted)

    article.metastring = str(leftovers)
    self.article = article
    return article
def scrape_3(self, _html):
    """
    Some ugly MS Word format, as of 2014-03-03.

    The elements matched by "body > div > *" are split into groups; a group
    is closed each time an element is met for which cssselect("hr") finds
    a match. One Article is yielded per group, except for the first group,
    which is skipped.

    NOTE(review): elements following the last such separator are never
    added to a group, so they do not produce an article -- confirm this is
    intended.

    @param _html: parsed document offering cssselect()
    @return generator of Article-objects
    """
    # Partition articles
    groups = []
    current = []
    for element in _html.cssselect("body > div > *"):
        if not element.cssselect("hr"):
            current.append(element)
            continue
        # Separator found: close the current group and start a new one.
        groups.append(current)
        current = []

    for tags in groups[1:]:
        fresh = Article()

        # NOTE(review): dateline and headline are both read from tags[1],
        # and tags[0] is not used at all -- confirm this is intended.
        dateline = tags[1].text_content().strip()
        article = self.parse_dateline(dateline, fresh)
        article.headline = tags[1].text_content().strip()

        # Everything from the third element onwards forms the body.
        html_str = "".join(html.tostring(t) for t in tags[2:])
        article.text = html2text(html_str)

        # NOTE(review): metastring is assigned a dictionary here rather
        # than a string -- verify against the consumers of this field.
        article.metastring = {'html': html_str}

        yield article