示例#1
0
文件: lexisnexis.py 项目: aemal/amcat
def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters.

    The medium is resolved with Medium.get_or_create(source).
    NOTE(review): earlier documentation said Medium.DoesNotExist is raised
    for an unknown source; with get_or_create a missing medium is presumably
    created instead -- confirm against Medium.

    @param headline: headline of new Article-object
    @type headline: str

    @param byline: byline for new Article
    @type byline: NoneType, str

    @param text: text for new Article
    @type text: str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. The keys 'author', 'section',
                 'length' and 'url' are extracted and stored on the
                 article itself; the given dictionary is not modified.
    @type meta: dictionary

    @return Article-object

    """
    # Name the one field the message needs instead of passing **locals().
    log.debug("Creating article object for {headline!r}".format(
        headline=headline))

    art = Article(headline=headline, byline=byline, text=text, date=date)

    art.medium = Medium.get_or_create(source)

    # Work on a copy so the caller's dictionary keeps all of its keys.
    meta = meta.copy()

    # Author / Section
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)

    if 'length' in meta:
        # Only the first whitespace-separated token is parsed as a number.
        art.length = int(meta.pop('length').split()[0])
    else:
        # No explicit length given: fall back to the number of spaces.
        art.length = art.text.count(" ")

    if 'url' in meta:
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # All whitespace is stripped from the url.
        art.url = re.sub(r"\s+", "", meta.pop('url'))

    # Whatever was not extracted above ends up in the metastring.
    art.metastring = str(meta)

    return art
示例#2
0
    def body_to_article(self, headline, byline, text, date, source, meta):
        """
        Build a new Article from the given parts and attach it to the
        project found in self.options['project'].

        The medium is resolved with get_or_create(Medium, name=source).
        NOTE(review): earlier documentation said Medium.DoesNotExist is
        raised for an unknown source; get_or_create presumably creates a
        missing medium instead -- confirm.

        @param headline: headline of new Article-object
        @type headline: unicode / str

        @param byline: byline for new Article
        @type byline: NoneType, unicode, str

        @param text: text for new Article
        @type text: unicode / str

        @param date: date(time) for new Article
        @type date: datetime.date, datetime.datetime

        @param source: medium-label for new Article
        @type source: unicode / str

        @param meta: meta-information; 'author', 'section' and 'length' are
                     taken out and stored on the article, everything else
                     is stringified into metastring. The given dictionary
                     itself is left untouched.
        @type meta: dictionary

        @return Article-object

        """
        message = "Creating article object for {headline!r}"
        log.debug(message.format(headline=headline))

        article = Article(
            headline=headline, byline=byline, text=text, date=date)
        article.medium = get_or_create(Medium, name=source)

        # Pop the dedicated fields from a private copy of the dictionary.
        remaining = meta.copy()
        article.author = remaining.pop('author', None)
        article.section = remaining.pop('section', None)

        # An explicit length wins (first whitespace-separated token);
        # otherwise the number of spaces in the text is used.
        article.length = (
            int(remaining.pop('length').split()[0])
            if 'length' in remaining
            else article.text.count(" ")
        )

        # Everything that was not extracted is kept as a string.
        article.metastring = str(remaining)

        article.project = self.options['project']
        return article
示例#3
0
    def create_article(self):
        """
        Convert the document object into an article.

        Every property returned by self.getprops() is passed through
        self._convert(). Properties named in _ARTICLE_PROPS are set as
        attributes on a new Article; the others are collected and stored,
        stringified, in Article.metastring. The new article is also kept
        on self.article.

        @return Article-object
        """
        article = Article()
        extras = {}

        for name, raw in self.getprops().items():
            converted = self._convert(raw)
            if name not in _ARTICLE_PROPS:
                # Not an Article field: keep it for the metastring.
                extras[name] = converted
                continue
            setattr(article, name, converted)

        article.metastring = str(extras)
        self.article = article
        return article
示例#4
0
    def create_article(self):
        """
        Turn this document's properties into a new Article.

        Each value from self.getprops() is converted with self._convert().
        Names listed in _ARTICLE_PROPS become attributes of the article;
        all remaining name/value pairs go, as the string form of a
        dictionary, into the article's metastring. The result is stored
        on self.article and returned.

        @return Article-object
        """
        result = Article()
        leftovers = {}

        for key, value in self.getprops().items():
            cooked = self._convert(value)
            is_article_field = key in _ARTICLE_PROPS
            if is_article_field:
                setattr(result, key, cooked)
            else:
                leftovers[key] = cooked

        result.metastring = str(leftovers)
        self.article = result
        return result
 def scrape_3(self, _html):
     """
     Yield articles from some ugly MS Word format, as of 2014-03-03.

     The elements matched by "body > div > *" are grouped into sections;
     a section is closed each time an element matches the "hr" selector.
     The first section is skipped, and elements following the last
     separator are never closed into a section, so they are not yielded.

     For each remaining section, the second element supplies the text for
     both the dateline and the headline, and the elements from the third
     onwards are serialised and converted to the article text.
     """
     sections = []
     pending = []
     for element in _html.cssselect("body > div > *"):
         if not element.cssselect("hr"):
             pending.append(element)
             continue
         # Separator reached: close the section collected so far.
         sections.append(pending)
         pending = []

     # sections[0] holds whatever precedes the first separator; skip it.
     for tags in sections[1:]:
         blank = Article()
         # NOTE(review): tags[1] is used for the dateline *and* the
         # headline, and tags[0] is never read -- confirm this is intended.
         article = self.parse_dateline(tags[1].text_content().strip(), blank)
         article.headline = tags[1].text_content().strip()

         body_html = "".join(html.tostring(node) for node in tags[2:])
         article.text = html2text(body_html)
         # The serialised HTML is kept alongside the converted text.
         article.metastring = {'html': body_html}

         yield article
示例#6
0
    def scrape_3(self, _html):
        """
        Yield articles from some ugly MS Word format, as of 2014-03-03.

        Children selected by "body > div > *" are split into groups at
        every element that matches the "hr" selector. The group before
        the first separator is ignored; elements after the last separator
        are never turned into a group and are therefore not yielded.

        Within a group, the second element provides the dateline and the
        headline text; the third and later elements are serialised to
        HTML, which is converted to the article text and also stored in
        the metastring under 'html'.
        """
        groups = []
        current = []
        for node in _html.cssselect("body > div > *"):
            is_separator = bool(node.cssselect("hr"))
            if is_separator:
                groups.append(current)
                current = []
            else:
                current.append(node)

        # Drop the leading group (content before the first separator).
        for tags in groups[1:]:
            article = Article()
            # NOTE(review): dateline and headline are both read from
            # tags[1]; tags[0] is unused -- verify against the format.
            dateline = tags[1].text_content().strip()
            article = self.parse_dateline(dateline, article)
            article.headline = tags[1].text_content().strip()

            html_str = "".join(map(html.tostring, tags[2:]))
            article.text = html2text(html_str)
            article.metastring = {'html': html_str}

            yield article