def test_citation_to_citeproc_pubmed_book():
    """
    Book records in PubMed cannot be converted to CSL metadata.

    There is no logic to parse the XML returned by
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=29227604&rettype=full
    so the conversion is expected to raise NotImplementedError.
    """
    book_citation = 'pmid:29227604'
    with pytest.raises(NotImplementedError):
        citation_to_citeproc(book_citation)
def process_record(record):
    """
    Expand a catalog record with retrieved metadata.

    The keys this function consumes are popped from ``record`` (the input
    dict is mutated). Whatever is left over is returned under ``'extras'``.
    The result always has a ``'manubot'`` entry and, when the record carries
    a citation accepted by ``is_valid_citation``, ``'preprint'`` and/or
    ``'journal'`` entries. Each entry is enriched with fields derived from
    its CSL item.
    """
    # html_url is popped before repo_url, matching the required-key order.
    html_url = record.pop('html_url')
    manubot_entry = {
        'repo_url': record.pop('repo_url'),
        'url': html_url,
        'citation': f"url:{html_url}",
    }
    if 'thumbnail_url' in record:
        manubot_entry['thumbnail_url'] = record.pop('thumbnail_url')
    output = {'manubot': manubot_entry}

    for publication_type in ('preprint', 'journal'):
        # The citation key is always removed so it never ends up in extras.
        citation = record.pop(f'{publication_type}_citation', None)
        if citation and is_valid_citation(citation):
            output[publication_type] = {'citation': citation}

    for entry in output.values():
        standard_citation = standardize_citation(entry['citation'])
        csl_item = citation_to_citeproc(standard_citation)
        # Only fall back to the CSL URL when the entry has no url of its own.
        if 'url' not in entry and 'URL' in csl_item:
            entry['url'] = csl_item['URL']
        entry['title'] = get_title(csl_item)
        entry['authors'] = get_authors_text(csl_item)
        entry['journal'] = get_journal(csl_item)
        entry['date_iso'] = get_date(csl_item)
        entry['date_human'] = get_date_summary(csl_item)
        entry['csl_item'] = csl_item

    # Added after the enrichment loop so extras is not treated as an entry.
    output['extras'] = record
    return output
def test_citation_to_citeproc_pubmed_with_numeric_month():
    """
    Check the issued date of a PubMed record (numeric-month case).

    Generated from XML returned by
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=29028984&rettype=full
    See https://github.com/manubot/manubot/issues/69
    """
    csl_item = citation_to_citeproc('pmid:29028984')
    # Printed so the retrieved item is visible in the captured test output.
    print(csl_item)
    issued_date_parts = csl_item['issued']['date-parts']
    assert issued_date_parts == [[2018, 3, 15]]
def test_citation_to_citeproc_doi_datacite():
    """
    Check the CSL item retrieved for the DOI 10.7287/peerj.preprints.3100v1.
    """
    csl_item = citation_to_citeproc('doi:10.7287/peerj.preprints.3100v1')
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('id', '11cb5HXoY'),
        ('URL', 'https://doi.org/10.7287/peerj.preprints.3100v1'),
        ('DOI', '10.7287/peerj.preprints.3100v1'),
        ('type', 'report'),
        ('title', 'Sci-Hub provides access to nearly all scholarly literature'),
    )
    for field, expected in expected_fields:
        assert csl_item[field] == expected
    author_list = csl_item['author']
    assert author_list[0]['family'] == 'Himmelstein'
    assert author_list[-1]['family'] == 'Greene'
def test_citation_to_citeproc_arxiv():
    """
    Check the CSL item retrieved for the arXiv identifier cond-mat/0703470v2.
    """
    csl_item = citation_to_citeproc('arxiv:cond-mat/0703470v2')
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('id', 'ES92tcdg'),
        ('URL', 'https://arxiv.org/abs/cond-mat/0703470v2'),
        ('number', 'cond-mat/0703470v2'),
        ('version', '2'),
        ('type', 'report'),
        ('container-title', 'arXiv'),
        ('title', 'Portraits of Complex Networks'),
    )
    for field, expected in expected_fields:
        assert csl_item[field] == expected
    first_author = csl_item['author'][0]
    assert first_author['literal'] == 'J. P. Bagrow'
    assert csl_item['DOI'] == '10.1209/0295-5075/81/68004'
def generate_csl_items(args, citation_df):
    """
    Generate CSL (citeproc) items for standard_ids in citation_df.
    Writes references.json to disk and logs warnings for potential problems.

    Parameters
    ----------
    args
        Object providing ``manual_references_paths``, ``requests_cache_path``,
        ``clear_requests_cache`` and ``references_path`` (the latter must
        support ``.open(mode, encoding=...)``, e.g. a ``pathlib.Path``).
    citation_df
        Object whose ``standard_id`` attribute supports ``.unique()``
        (presumably a pandas DataFrame column; confirm against the caller).

    Returns
    -------
    list
        The CSL items, in the order their standard_ids were encountered.
        Identifiers whose metadata could not be obtained are logged and
        omitted; no exception is raised for them.
    """
    # Read manual references (overrides) in JSON CSL
    manual_refs = load_manual_references(args.manual_references_paths)

    requests_cache.install_cache(args.requests_cache_path, include_get_headers=True)
    csl_items = list()
    failures = list()
    try:
        cache = requests_cache.get_cache()
        if args.clear_requests_cache:
            logging.info('Clearing requests-cache')
            requests_cache.clear()
        logging.info(
            f'requests-cache starting with {len(cache.responses)} cached responses'
        )

        for standard_id in citation_df.standard_id.unique():
            if standard_id in manual_refs:
                csl_items.append(manual_refs[standard_id])
                continue
            if standard_id.startswith('raw:'):
                logging.error(
                    f'CSL JSON Data with a standard_id of {standard_id} not found in manual-references.json. '
                    'Metadata must be provided for raw citations.')
                failures.append(standard_id)
                # Raw citations have no retrievable metadata: skip the lookup
                # so the id is not attempted and reported as a failure twice.
                continue
            try:
                citeproc = citation_to_citeproc(standard_id)
                csl_items.append(citeproc)
            except Exception:
                # Best effort: one bad identifier must not abort the whole run.
                logging.exception(f'Citeproc retrieval failure for {standard_id}')
                failures.append(standard_id)

        logging.info(
            f'requests-cache finished with {len(cache.responses)} cached responses'
        )
    finally:
        # Always undo the global cache installation, even on unexpected errors.
        requests_cache.uninstall_cache()

    if failures:
        message = 'CSL JSON Data retrieval failed for:\n{}'.format(
            '\n'.join(failures))
        logging.error(message)

    # Write JSON CSL bibliography for Pandoc.
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly rather than with the locale's default encoding.
    with args.references_path.open('w', encoding='utf-8') as write_file:
        json.dump(csl_items, write_file, indent=2, ensure_ascii=False)
        write_file.write('\n')
    return csl_items
def test_citation_to_citeproc_pmc(identifier, citation_id):
    """
    Check the CSL item retrieved for a ``pmcid:`` citation.

    ``identifier`` is appended to the ``pmcid:`` prefix and ``citation_id`` is
    the expected ``id`` of the resulting item. Both are supplied from outside
    this function (presumably by a parametrize decorator; not visible here).
    """
    csl_item = citation_to_citeproc(f'pmcid:{identifier}')
    assert csl_item['id'] == citation_id
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('URL', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3041534/'),
        ('container-title', 'Summit on Translational Bioinformatics'),
        ('title', 'Secondary Use of EHR: Data Quality Issues and Informatics Opportunities'),
    )
    for field, expected in expected_fields:
        assert csl_item[field] == expected
    first_author = csl_item['author'][0]
    assert first_author['family'] == 'Botsis'
    assert csl_item['PMID'] == '21347133'
    assert csl_item['PMCID'] == 'PMC3041534'
def test_citation_to_citeproc_pmc():
    """
    Check the CSL item retrieved for pmcid:PMC3041534.

    https://api.ncbi.nlm.nih.gov/lit/ctxp/v1/pmc/?format=csl&id=3041534
    """
    csl_item = citation_to_citeproc('pmcid:PMC3041534')
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('id', 'RoOhUFKU'),
        ('URL', 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3041534/'),
        ('container-title-short', 'Summit Transl Bioinform'),
        ('title', 'Secondary Use of EHR: Data Quality Issues and Informatics Opportunities'),
    )
    for field, expected in expected_fields:
        assert csl_item[field] == expected
    first_author = csl_item['author'][0]
    assert first_author['family'] == 'Botsis'
    assert csl_item['PMID'] == '21347133'
    assert csl_item['PMCID'] == 'PMC3041534'
    # The note is expected to record provenance of the generated item.
    assert 'generated by Manubot' in csl_item['note']
    assert 'standard_id: pmcid:PMC3041534' in csl_item['note']
def test_citation_to_citeproc_pubmed_1():
    """
    Check the CSL item retrieved for pmid:21347133.

    Generated from XML returned by
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=21347133&rettype=full
    """
    csl_item = citation_to_citeproc('pmid:21347133')
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('id', 'y9ONtSZ9'),
        ('type', 'article-journal'),
        ('URL', 'https://www.ncbi.nlm.nih.gov/pubmed/21347133'),
        ('container-title', 'Summit on translational bioinformatics'),
        ('title', 'Secondary Use of EHR: Data Quality Issues and Informatics Opportunities.'),
    )
    for field, expected in expected_fields:
        assert csl_item[field] == expected
    assert csl_item['issued']['date-parts'] == [[2010, 3, 1]]
    author_list = csl_item['author']
    assert author_list[0]['given'] == 'Taxiarchis'
    assert author_list[0]['family'] == 'Botsis'
    assert csl_item['PMID'] == '21347133'
    assert csl_item['PMCID'] == 'PMC3041534'
def test_citation_to_citeproc_pubmed_2():
    """
    Check the CSL item retrieved for pmid:27094199.

    Generated from XML returned by
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=27094199&rettype=full
    """
    csl_item = citation_to_citeproc('pmid:27094199')
    # Printed so the retrieved item is visible in the captured test output.
    print(csl_item)
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('id', 'alaFV9OY'),
        ('type', 'article-journal'),
        ('URL', 'https://www.ncbi.nlm.nih.gov/pubmed/27094199'),
        ('container-title', 'Circulation. Cardiovascular genetics'),
        ('container-title-short', 'Circ Cardiovasc Genet'),
        ('page', '179-84'),
        ('title', 'Genetic Association-Guided Analysis of Gene Networks for the Study of Complex Traits.'),
    )
    for field, expected in expected_fields:
        assert csl_item[field] == expected
    assert csl_item['issued']['date-parts'] == [[2016, 4]]
    author_list = csl_item['author']
    assert author_list[0]['given'] == 'Casey S'
    assert author_list[0]['family'] == 'Greene'
    assert csl_item['PMID'] == '27094199'
    assert csl_item['DOI'] == '10.1161/circgenetics.115.001181'
def test_citation_to_citeproc_isbn():
    """Check the type and title of the CSL item for isbn:9780387950693."""
    isbn_citation = 'isbn:9780387950693'
    book = citation_to_citeproc(isbn_citation)
    item_type = book['type']
    assert item_type == 'book'
    item_title = book['title']
    assert item_title == 'Complex analysis'