def process_articles(input_filename, writer, mappings):
    """Run ``process_article`` on every 'Article' dataset in a gzipped tar.

    Args:
        input_filename: Path of the gzip-compressed tar archive to read.
        writer: Passed through unchanged to ``process_article``.
        mappings: Passed through unchanged to ``process_article``.

    An exception raised while handling a single dataset is printed to
    stderr and the remaining datasets are still processed.  Errors raised
    while opening the archive propagate to the caller.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    try:
        for tar_info, journal in get_datasets(tar, types=('Article',)):
            try:
                process_article(journal['recordList'], writer, mappings)
            except Exception:
                # Best effort: report the failure and move on to the next
                # dataset instead of aborting the whole run.
                traceback.print_exc(file=sys.stderr)
    finally:
        # Always release the archive's file handle, even if iteration fails.
        tar.close()
def process_articles(input_filename, writer, mappings):
    """Run ``process_article`` on every 'Article' dataset in a gzipped tar.

    Args:
        input_filename: Path of the gzip-compressed tar archive to read.
        writer: Passed through unchanged to ``process_article``.
        mappings: Passed through unchanged to ``process_article``.

    An exception raised while handling a single dataset is printed to
    stderr and the remaining datasets are still processed.  Errors raised
    while opening the archive propagate to the caller.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    try:
        for tar_info, journal in get_datasets(tar, types=('Article',)):
            try:
                process_article(journal['recordList'], writer, mappings)
            except Exception:
                # Best effort: report the failure and move on to the next
                # dataset instead of aborting the whole run.
                traceback.print_exc(file=sys.stderr)
    finally:
        # Always release the archive's file handle, even if iteration fails.
        tar.close()
# Gzip-compressed tar archive that the module-level comparison loop below
# passes to get_datasets().  It is opened when this code runs and is not
# explicitly closed anywhere in the visible code.
tar = tarfile.open('../parsed-recent/articles-unified.bibjson.tar.gz', 'r:gz')

# Shared mapping table handed to majority_vote() and format_article();
# looking up a missing top-level key yields a fresh empty dict.
mappings = defaultdict(dict)

def format_article(article, mappings, records):
    """Render one article as a single fixed-width row of text.

    The row holds four columns separated by two spaces: DOI (30 chars),
    PMID (10), title (80) and the author list (100).  Each value is cut to
    its column width and right-padded with spaces; a missing value is
    rendered as blanks.

    Each entry of ``article['author']`` is passed through
    ``normalize_field`` together with ``mappings`` and ``records``; the
    ``name`` of the result ('-' when absent) has its commas and periods
    removed, and the names are joined with ', '.
    """
    cleaned_names = []
    for author in article.get('author', ()):
        resolved = normalize_field(author, mappings, records)
        name = resolved.get('name', '-')
        cleaned_names.append(name.replace(',', '').replace('.', ''))
    authors = ', '.join(cleaned_names)

    # (text, column width) pairs in output order.
    columns = (
        (article.get('doi', ''), 30),
        (article.get('pmid', ''), 10),
        (article.get('title', ''), 80),
        (authors, 100),
    )
    cells = tuple(text[:width].ljust(width) for text, width in columns)
    return '%30s  %10s  %80s  %100s' % cells



# For every 'Article' dataset in the archive: derive a canonical record by
# majority vote, print the result of compare() for the dataset's entries,
# then print the canonical article followed by each individual article as
# fixed-width rows.
for tar_info, dataset in get_datasets(tar, ('Article',)):
    records = dataset['recordList']
    # Select the article records now, while `records` is still the original
    # list; it is rebound to a dict a few lines below.
    articles = [r for r in records if r.get('type') == 'Article']
    # NOTE(review): `fields` is not used anywhere in the visible code.
    canonical, fields = majority_vote(records, ('Article',), mappings)

    # Re-index the records by their 'id'; from here on `records` is a dict.
    records = dict((r['id'], r) for r in records)

    # NOTE(review): iterating a dict yields its keys, so `record` is a record
    # id here rather than the record dict -- confirm that compare() expects
    # an id; otherwise this should iterate over the dict's values.
    for record in records:
        v = compare(canonical, record, records)
        print v

    # Canonical row first, then one row per article in the dataset.
    print '='*80
    print format_article(canonical, mappings, records)
    print '-'*80
    for article in articles:
        print format_article(article, mappings, records)
def process_errors(input_filename, writer, mappings):
    """Run ``process_error`` on every dataset in a gzipped tar archive.

    Args:
        input_filename: Path of the gzip-compressed tar archive to read.
        writer: Passed through unchanged to ``process_error``.
        mappings: Passed through unchanged to ``process_error``.

    Datasets are requested from ``get_datasets`` with ``types=None``.  Any
    exception raised while opening the archive or processing a dataset
    propagates to the caller.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    try:
        for tar_info, dataset in get_datasets(tar, types=None):
            process_error(dataset['recordList'], writer, mappings)
    finally:
        # Always release the archive's file handle, even if processing fails.
        tar.close()
def process_citations(input_filename, writer, mappings):
    """Run ``process_citation`` on every 'Article' dataset in a gzipped tar.

    Args:
        input_filename: Path of the gzip-compressed tar archive to read.
        writer: Passed through unchanged to ``process_citation``.
        mappings: Passed through unchanged to ``process_citation``.

    Any exception raised while opening the archive or processing a dataset
    propagates to the caller.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    try:
        for tar_info, journal in get_datasets(tar, types=('Article',)):
            process_citation(journal['recordList'], writer, mappings)
    finally:
        # Always release the archive's file handle, even if processing fails.
        tar.close()
def process_errors(input_filename, writer, mappings):
    """Run ``process_error`` on every dataset in a gzipped tar archive.

    Args:
        input_filename: Path of the gzip-compressed tar archive to read.
        writer: Passed through unchanged to ``process_error``.
        mappings: Passed through unchanged to ``process_error``.

    Datasets are requested from ``get_datasets`` with ``types=None``.  Any
    exception raised while opening the archive or processing a dataset
    propagates to the caller.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    try:
        for tar_info, dataset in get_datasets(tar, types=None):
            process_error(dataset['recordList'], writer, mappings)
    finally:
        # Always release the archive's file handle, even if processing fails.
        tar.close()
def process_citations(input_filename, writer, mappings):
    """Run ``process_citation`` on every 'Article' dataset in a gzipped tar.

    Args:
        input_filename: Path of the gzip-compressed tar archive to read.
        writer: Passed through unchanged to ``process_citation``.
        mappings: Passed through unchanged to ``process_citation``.

    Any exception raised while opening the archive or processing a dataset
    propagates to the caller.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    try:
        for tar_info, journal in get_datasets(tar, types=('Article',)):
            process_citation(journal['recordList'], writer, mappings)
    finally:
        # Always release the archive's file handle, even if processing fails.
        tar.close()