示例#1
0
    def convert_article(article):
        """Convert an article from KB (Delpher) format to AMCAT format.

        Unused keys in AMCAT format are: section, byline, length, externalid, author, addressee, uuid

        Args:
            article: representation of one article as retrieved from Delpher API (accessed as a mapping).
                Predicate: 'identifier' and 'date' are always set, and 'date' is formatted as
                '%Y/%m/%d %H:%M:%S'.

        Returns:
            An article in Amcat's representation. Postcondition: date, headline, text, and medium are always set
            to a non-empty value. 'pagenr' is an int when the source page consists of digits only, otherwise ''.

        Raises:
            KeyError: if 'identifier' or 'date' is missing from the article.
            ValueError: if 'date' does not match the expected format.
        """
        ocr = DelpherAPI.article_ocr(article['identifier'])
        page = article.get('page', '')

        # `dict.get(key, default)` only falls back when the key is absent. A title or paper title that is present
        # but empty would slip through and violate the postcondition, so fall back on any falsy value instead.
        # The headline fallback is the first 30 characters of the OCR text followed by an ellipsis.
        headline = article.get('title') or '%(ocr).30s...' % {'ocr': ocr}
        medium = article.get('papertitle') or 'ppn: {0}'.format(delpher.ppn)

        return {
            'date': datetime.strptime(article['date'], '%Y/%m/%d %H:%M:%S').isoformat(),
            'headline': headline,  # headline must not be empty
            'medium': medium,
            'text': ocr if ocr else '<no text>',  # text must not be empty either
            'pagenr': int(page) if page.isdigit() else '',
            'url': article.get('metadataKey', ''),
            'metastring': json.dumps(article)  # Just store all available information for potential later use.
        }