def _parse_page_number(page):
    """Return ``page`` as an int when it consists solely of digits, else ''."""
    # Normalise to text first so a non-string value (e.g. an int or None)
    # does not blow up on the .isdigit() call below.
    text = str(page)
    if not text.isdigit():
        return ''
    try:
        return int(text)
    except ValueError:
        # Some characters (e.g. superscript digits) satisfy isdigit() but
        # cannot be parsed by int().
        return ''


def convert_article(article):
    """Convert article from KB format to AMCAT format.

    Unused keys in AMCAT format are: section, byline, length, externalid,
    author, addressee, uuid

    Args:
        article: representation of one article as retrieved from Delpher API.
            Predicate: Identifier and date are always set.

    Returns:
        An article in Amcat's representation.
        Postcondition: date, headline, text, and medium are always set.

    Raises:
        KeyError: if 'identifier' or 'date' is missing from ``article``.
        ValueError: if ``article['date']`` does not match
            '%Y/%m/%d %H:%M:%S'.
    """
    ocr = DelpherAPI.article_ocr(article['identifier'])

    # Use `or` rather than a dict.get() default: the default only applies
    # when the key is absent, whereas headline and medium must also fall
    # back when the key is present but empty.
    headline = article.get('title') or '%(ocr).30s...' % {'ocr': ocr}
    medium = article.get('papertitle') or 'ppn: {0}'.format(delpher.ppn)

    return {
        'date': datetime.strptime(article['date'], '%Y/%m/%d %H:%M:%S').isoformat(),
        'headline': headline,  # headline must not be empty
        'medium': medium,
        'text': ocr or '<no text>',
        'pagenr': _parse_page_number(article.get('page', '')),
        'url': article.get('metadataKey', ''),
        # Just store all available information for potential later use.
        'metastring': json.dumps(article),
    }