def main():
    """CLI entry point: resolve the identifier argument to a DOI and print
    the requested metadata representation for it."""
    doi = _extract_doi(args.identifier[0])
    if doi is None:
        # NOTE(review): `item` is not defined in this function — presumably a
        # module-level variable holding the raw identifier; verify.
        print(item)
    elif args.bibtex:
        # Fetch bibtex via Crossref content negotiation and rebuild the entry
        # with a deterministic key of the form <lastname>_<year>_<shortdoi>.
        result = cn.content_negotiation(doi, format="bibtex")
        bibtex = parse_string(result, "bibtex")
        try:
            # NOTE(review): `.values()[0]` / `.keys()[0]` subscripting suggests
            # Python 2 (or pybtex's list-returning dict views) — confirm the
            # target interpreter; dict views are not indexable on Python 3.
            name = "".join(
                bibtex.entries.values()[0].persons.values()[0][0].last_names)
            # Transliterate German umlauts first, then strip any remaining
            # non-ASCII characters for a filesystem/bibtex-safe key.
            name = name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue")
            name = unidecode(name)
            # [3:] drops the leading "10/" from the shortDOI.
            shortdoi = _short_doi(doi)[3:]
            year = bibtex.entries.values()[0].fields["year"]
            key = "{}_{}_{}".format(name, year, shortdoi)
            new = BibliographyData()
            new.add_entry(key, bibtex.entries[bibtex.entries.keys()[0]])
            print(new.to_string("bibtex"))
        except KeyError:
            # Entry lacks authors/year — fall back to the raw bibtex.
            print(result)
    else:
        try:
            result = cn.content_negotiation(doi, format=args.format)
            print(result)
        except requests.exceptions.HTTPError:
            # DOI could not be resolved; echo it followed by a blank line.
            print(doi)
            print()
def test_content_negotiation_style():
    "content negotiation - style"
    doi = u"10.1126/science.169.3946.635"
    apa = cn.content_negotiation(ids=doi, format="text", style="apa")
    ieee = cn.content_negotiation(ids=doi, format="text", style="ieee")
    # Different CSL styles must yield different formatted citations.
    assert apa != ieee
def references_to_bib(refs):
    """Convert reference strings to bibtex entries.

    Args:
        refs ([str]): references that are either cached, url-style
            ("url:URL_GOES_HERE") or doi-style ("doi:DOI_GOES_HERE").

    Returns:
        (list): bibtex formatted strings, one per input reference.
    """
    parsed_refs = []
    for ref in refs:
        if ref in _REFERENCE_CACHE:
            parsed_ref = _REFERENCE_CACHE[ref]
        elif ref.startswith('url:'):
            # Synthesize a minimal entry keyed by the url's hash.
            url = ref.split('url:')[1]
            # Fixed: the template used "\{{" which emitted a literal
            # backslash before the brace, producing invalid BibTeX.
            parsed_ref = "@article{{{0},\n\turl = {{{1}}}\n}}".format(
                url.__hash__(), url)
        elif ref.startswith('doi:'):
            doi = ref.split('doi:')[1]
            parsed_ref = content_negotiation(doi, format='bibentry')
        else:
            # Fixed: missing space between the concatenated message parts.
            raise ValueError('Unknown reference style for '
                             'reference: {}'.format(ref))
        if ref not in _REFERENCE_CACHE:
            _REFERENCE_CACHE[ref] = parsed_ref
            dumpfn(_REFERENCE_CACHE, _REFERENCE_CACHE_PATH)
        # Fixed: previously appended the raw input `ref` and returned the
        # input list `refs` unchanged, discarding all parsed entries.
        parsed_refs.append(parsed_ref)
    return parsed_refs
def load_records(self, DOIs=None):
    """Load all crossref items as valid records"""
    # Fetch citeproc-json metadata for one or more DOIs in a single call.
    records = cn.content_negotiation(ids=DOIs, format='citeproc-json')
    # Records might be a str or unicode (python 2)
    if not isinstance(records, list):
        records = [records, ]
    self.records = []
    for r in records:
        data = json.loads(r)
        try:
            record = self.to_record(data)
        except Exception:
            # Wrap any conversion failure in a domain error naming the DOI;
            # the full exception and payload go to the log.
            e, v, tb = sys.exc_info()
            msg = _(
                "An error occured while loading the following DOI: {}. "
                "Check logs for details."
            ).format(
                data.get('DOI')
            )
            logger.error(
                '{}, error: {} [{}], data: {}'.format(msg, e, v, data)
            )
            raise DOILoaderError(msg)
        self.records.append(record)
def query_arXiv(title, author): """Query arXiv for the extracted data Args: title (str): The title of the paper author (List(str)): A list of the authors of the paper Raises: ExtractionError: No suitable search criteria extracted ExtractionError: Entry found but no DOI ExtractionError: No matches found Returns: str: A BibTeX entry for the queried data """ # print("Querying arXiv") if author and title: results = arxiv.query(title + " " + author[0], max_results=5) elif title: results = arxiv.query(title, max_results=5) else: raise ExtractionError("No suitable search criteria extracted") for result in results: if SequenceMatcher(None, result["title"].upper(), title.upper()).ratio() > 0.9: if result["doi"]: BibTeX = cn.content_negotiation(ids=result["doi"], format="bibentry") return BibTeX else: BibTeX = generate_bibtex_from_arXiv(result) return BibTeX else: raise ExtractionError("No matches found")
def references_to_bib(refs, check_if_valid_citation=True):
    """
    Convert a list of reference strings into bibtex entries.

    Args:
        refs ([str]): string references, which can be bibtex entries,
            digital object identifiers ("doi:DOI_GOES_HERE") or urls
            ("url:URL_GOES_HERE")
        check_if_valid_citation (bool): when True, verify each parsed
            reference produces a valid markdown-style citation; raises
            ValueError if the conversion fails.

    Returns:
        (list): list of bibtex formatted strings
    """
    converted = []
    for reference in refs:
        if reference in _REFERENCE_CACHE:
            entry = _REFERENCE_CACHE[reference]
        elif reference.startswith('@'):
            # Already a bibtex entry; pass through untouched.
            entry = reference
        elif reference.startswith('url:'):
            # uses arbitrary key
            target = reference.split('url:')[1]
            entry = """@misc{{url:{0}, url = {{{1}}} }}""".format(
                str(abs(target.__hash__()))[0:6], target)
        elif reference.startswith('doi:'):
            entry = content_negotiation(reference.split('doi:')[1],
                                        format='bibentry')
        else:
            raise ValueError(
                'Unknown reference style for '
                'reference: {} (please either '
                'supply a BibTeX string, or a string '
                'starting with url: followed by a URL or '
                'starting with doi: followed by a DOI)'.format(reference))
        if check_if_valid_citation:
            try:
                _ = references_to_markdown(entry)
            except Exception as ex:
                raise ValueError(
                    "Reference '{}' returned the following error.\n"
                    "You may need to manually generate a bibtex string:\n"
                    "{}".format(reference, ex))
        if reference not in _REFERENCE_CACHE:
            # Persist newly parsed entries so future runs skip the lookup.
            _REFERENCE_CACHE[reference] = entry
            dumpfn(_REFERENCE_CACHE, _REFERENCE_CACHE_PATH)
        converted.append(entry)
    return converted
def get_crossref_metadata(title, path):
    """
    Gets Crossref metadata, given an article's title. Then puts the metadata on the clipboard
    :param title: Title to search for
    :param path: PDF-Path, not necessary
    """
    # NOTE: Python 2 code (print statements, unicode builtin).
    print "getting crossref"
    # Searches the Crossref API for the given title, gets best result
    cr = Crossref()
    query = cr.works(query=title, limit=1)
    doi = ''
    # Extract DOI out of Crossref answer
    for item in query['message']['items']:
        doi = item['DOI']
    # Not used, but useful. Gets metadata from isbnlib, given DOI
    # print isbnlib.doi2tex(doi)
    # Gets APA citation, given DOI
    apa_citation = cn.content_negotiation(ids=doi, format="text", style="apa")
    # We could get more formats this way, but this is not used at the moment, better performance without getting these formats
    # rdf_citation = cn.content_negotiation(ids=doi, format="rdf-xml")
    # json_citation = cn.content_negotiation(ids=doi, format="citeproc-json")
    # bib_entry = cn.content_negotiation(ids=doi, format="bibentry")
    # Prettify APA citation
    apa_citation = prettify_UTF8_Strings(apa_citation).strip('\n')
    print apa_citation
    clp.OpenClipboard(None)
    citations = {}
    citations['APA'] = apa_citation
    try:
        citations['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT), errors='replace')
    except:
        # NOTE(review): bare except silently falls back when the clipboard has
        # no text content — consider narrowing the exception type.
        citations['content'] = 'no text content available'
    # Puts the citations on the clipboard
    clp.SetClipboardData(citation_format, json.dumps(citations))
    sources = {}
    sources['source'] = path
    try:
        sources['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT), errors='replace')
    except:
        sources['content'] = 'no text content available'
    # Puts the sources on the clipboard
    clp.SetClipboardData(src_format, json.dumps(sources))
    clp.CloseClipboard()
def load_bibtex_cached(doi, reload_cache=False):
    """Return the bibtex entry for `doi`, consulting a persistent cache.

    The entry is downloaded from Crossref only when missing from the cache
    or when `reload_cache` forces a refresh; the cache is re-stored after
    each download.
    """
    cache = load_doi_cache()
    if reload_cache or doi not in cache:
        try:
            entry = cn.content_negotiation(ids=doi)
        except Exception as e:
            # NOTE(review): if error() returns instead of exiting, `entry` is
            # unbound on the next line (UnboundLocalError) — confirm that
            # error() terminates the program.
            error("There was a problem contacting crossref:\n", str(e))
        cache[doi] = entry
        store_doi_cache(cache)
    return cache[doi]
def make_references(publications, output_dir):
    """
    Create reference bib file

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    crossref = Crossref()
    bib_entries = []
    ref_ids = []
    for i, pub in enumerate(publications):
        log.notice(
            f"Querying and formatting {i + 1} out of {len(publications)} publications"
        )
        link = pub[LINK]
        title = pub[TITLE]
        if link and "doi.org" in link:
            # The link is already a DOI url; the DOI is its path component.
            doi = urlparse(link).path.strip("/")
        else:
            # Fall back to a bibliographic search on Crossref by title.
            results = crossref.works(query_bibliographic=title, limit=1)
            message = results["message"]
            missing = (message["total-results"] == 0
                       or message["items"][0]["title"][0].lower() != title.lower())
            if missing:
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = message["items"][0]["DOI"]
        try:
            entry = cn.content_negotiation(doi)
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')
        else:
            bib_entries.append(entry)
            # The identifier sits between "@<type>{" and the trailing comma
            # on the entry's first line.
            ref_ids.append(re.sub("^@.*{", "", entry.split("\n")[0]).strip(","))
    with open(os.path.join(output_dir, "references.bib"), "w") as f:
        f.write("\n\n".join(bib_entries))
    return ref_ids
def crossref_publications(doi_missed):
    """Resolve missed DOIs via Crossref and append [source, title, year, DOI]
    rows to the module-level `final_result` list."""
    global final_result
    # De-duplicate before querying.
    for i in set(doi_missed):
        try:
            crossrefObject = cn.content_negotiation(ids=i, format='citeproc-json')
            # print('Calling ',i)
            data = json.loads(crossrefObject)
            # Prefer the print publication year; fall back to the online one.
            if 'published-print' in data.keys():
                final_result.append(['crossref', data['title'], data['published-print']['date-parts'][0][0], data['DOI']])
            else:
                final_result.append(['crossref', data['title'], data['published-online']['date-parts'][0][0], data['DOI']])
        except requests.exceptions.HTTPError as error:
            print('DOI not found ', error)
            # NOTE(review): `crossref_doi` appears to be a module-level list
            # collecting unresolved DOIs — confirm.
            crossref_doi.append(i)
def make_references(publications, output_dir):
    """
    Create reference bib file

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    crossref_client = Crossref()
    entries = []
    identifiers = []
    for i, publication in enumerate(publications):
        log.notice(f'Querying and formatting {i + 1} out of {len(publications)} publications')
        link = publication[LINK]
        title = publication[TITLE]
        if link and 'doi.org' in link:
            # DOI url: strip the host, keep the path.
            doi = urlparse(link).path.strip('/')
        else:
            # Otherwise locate the DOI by an exact-title Crossref search.
            results = crossref_client.works(query_title=title, limit=1)
            message = results['message']
            if (message['total-results'] == 0
                    or message['items'][0]['title'][0].lower() != title.lower()):
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = message['items'][0]['DOI']
        try:
            entry = cn.content_negotiation(doi)
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')
        else:
            entries.append(entry)
            # Pull the citation key out of the entry's first line.
            identifiers.append(re.sub('^@.*{', '', entry.split('\n')[0]).strip(','))
    with open(os.path.join(output_dir, 'references.bib'), 'w') as f:
        f.write('\n\n'.join(entries))
    return identifiers
def batch_doi2pmid(dois):
    """
    resolve article PMID from DOI by feeding article citation to PubMed advanced search
    @param dois: list of DOIs to resolve
    @return: list of corresponding PMIDs
    """
    # NOTE: Python 2 code (print statements).
    citations = []
    for doi in dois:
        # Strip a trailing period that would break DOI resolution.
        if doi[-1] == '.':
            doi = doi[:-1]
        try:
            # what if one fails?!
            cit = cn.content_negotiation(ids=doi, format="citeproc-json")
            if isinstance(cit, list):
                for c in cit:
                    citations.append(c)
            else:
                citations.append(cit)
        except Exception as e:
            # Best-effort: log and skip failing DOIs.
            print e
            continue
    parsed_citations = []
    for x in citations:
        try:
            cit = json.loads(x)
        except TypeError as e:
            print e
            continue
        # Map citeproc-json fields onto the ecitmatch citation schema.
        parsed_cit = {}
        if 'page' in cit:
            parsed_cit['first_page'] = cit['page'].split('-')[0]
        if 'volume' in cit:
            parsed_cit['volume'] = cit['volume']
        if 'container-title' in cit:
            parsed_cit['journal'] = cit['container-title']
        if 'issued' in cit:
            parsed_cit['year'] = cit['issued']['date-parts'][0][0]
        if 'author' in cit:
            if 'family' in cit['author'][0]:
                parsed_cit['aulast'] = cit['author'][0]['family']
        parsed_citations.append(parsed_cit)
    pmids = ecitmatch_tools.batch_pmids_for_citation(parsed_citations, debug=False)
    return pmids
def get_citation(self, doi, localpath=None, verbose=False):
    """Retrieve a Citation for `doi`, checking (in order): already-loaded
    citations, local files, the remote database, and finally CrossRef."""
    # Try loaded values first
    if self.citations_df is not None:
        # NOTE(review): indexes self.citations with a boolean mask built from
        # self.citations_df — assumes the two are aligned; confirm whether
        # one of the attribute names is a typo.
        matches = self.citations[(self.citations_df.doi == doi)
                                 | (self.citations_df.note == doi)]
        if len(matches) == 1:
            if verbose:
                print('Citation retrieved from loaded citations')
            return matches[0]
        elif len(matches) > 1:
            raise ValueError('Multiple loaded records found for the given doi')
    # '/' is not safe in file/record names, so flatten the DOI.
    doifname = doi.lower().replace('/', '_')
    # Try localpath next
    if localpath is None:
        localpath = self.localpath
    if localpath is not None:
        for fname in Path(localpath, 'Citation').glob(doifname + '.*'):
            if verbose:
                print(f'Citation retrieved from local file {fname.name}')
            with open(fname, encoding='UTF-8') as f:
                return Citation(f.read())
    # Try remote next
    try:
        record = self.cdcs.query(template='Citation', title=doifname)
        assert len(record) == 1
    except:
        # NOTE(review): bare except deliberately treats any remote failure
        # (including "not exactly one match") as "not found".
        pass
    else:
        if verbose:
            print(f'Citation retrieved from remote database')
        return Citation(record.iloc[0].xml_content)
    # Lastly, download from CrossRef
    bibtex = cn.content_negotiation(ids=doi, format="bibtex")
    if verbose:
        print(f'Citation retrieved from CrossRef')
    return Citation(bibtex)
def fetch(self, doi, localdir=None, verbose=True):
    """
    Fetches bibtex for published content.  First checks localdir, then
    potentials github, then CrossRef.

    Parameters
    ----------
    doi : str or list
        The reference doi to fetch content for.
    localdir : Path, optional
        The local directory for the .bib files.  If not given, will use
        the default path in potentials/data/bibtex directory.
    verbose : bool, optional
        If True (default), prints where the bibtex was loaded from.
    """
    localfile = self.localfilepath(doi=doi, localdir=localdir)
    if localfile.is_file():
        # Load bibtex from file
        with open(localfile, encoding='UTF-8') as f:
            entry = f.read()
        if verbose:
            print(f'bibtex loaded {doi} from localdir')
    else:
        try:
            r = requests.get(f'https://github.com/usnistgov/potentials/raw/master/data/bibtex/{localfile.name}')
            r.raise_for_status()
            entry = r.text
            if verbose:
                print(f'bibtex downloaded {doi} from github')
        except:
            # NOTE(review): bare except — any failure (even KeyboardInterrupt)
            # falls through to CrossRef; consider narrowing to
            # requests.exceptions.RequestException.
            # Download using habanero
            entry = cn.content_negotiation(ids=doi, format="bibtex")
            if verbose:
                print(f'bibtex downloaded {doi} from CrossRef')
    # Parse and extract content
    parser = BibTexParser()
    parser.customization = convert_to_unicode
    bibdatabase = bibtexparser.loads(entry, parser=parser)
    # Set object attributes (name-mangled private storage).
    self.__doi = doi
    self.__bibdatabase = bibdatabase
    self.__content = self.__bibdatabase.entries[0]
def references_to_bib(refs):
    """
    Takes a list of reference strings and converts them to bibtex
    entries

    Args:
        refs ([str]): list of string references, which can be bibtex entries,
            digital object identifiers ("doi:DOI_GOES_HERE") or urls
            ("url:URL_GOES_HERE")

    Returns:
        (list): list of bibtex formatted strings
    """
    output = []
    for item in refs:
        if item in _REFERENCE_CACHE:
            bib_entry = _REFERENCE_CACHE[item]
        elif item.startswith('@'):
            # Already bibtex — use as-is.
            bib_entry = item
        elif item.startswith('url:'):
            # uses arbitrary key
            link = item.split('url:')[1]
            bib_entry = """@misc{{url:{0}, url = {{{1}}} }}""".format(
                str(abs(link.__hash__()))[0:6], link)
        elif item.startswith('doi:'):
            bib_entry = content_negotiation(item.split('doi:')[1],
                                            format='bibentry')
        else:
            raise ValueError(
                'Unknown reference style for '
                'reference: {} (please either '
                'supply a BibTeX string, or a string '
                'starting with url: followed by a URL or '
                'starting with doi: followed by a DOI)'.format(item))
        if item not in _REFERENCE_CACHE:
            # Cache the parsed entry and persist the cache immediately.
            _REFERENCE_CACHE[item] = bib_entry
            dumpfn(_REFERENCE_CACHE, _REFERENCE_CACHE_PATH)
        output.append(bib_entry)
    return output
def fetch_citation(self, doi, local=None, remote=None, verbose=False):
    """
    Retrieves a single citation based on its DOI.  First, the database is
    checked for matches with the DOI, then with the record name.  If no
    matches are found in the database, then the corresponding citation is
    downloaded from CrossRef.

    Parameters
    ----------
    doi : str
        The citation's DOI.  If the citation has no DOI, then the
        citation's record name should be given instead.
    local : bool, optional
        Indicates if the local location is to be searched.  Default value
        matches the value set when the database was initialized.
    remote : bool, optional
        Indicates if the remote location is to be searched.  Default value
        matches the value set when the database was initialized.
    verbose : bool, optional
        If True, info messages will be printed during operations.  Default
        value is False.
    """
    if local is not False or remote is not False:
        # Try fetching based on doi
        try:
            return self.get_citation(doi=doi, local=local, remote=remote,
                                     verbose=verbose)
        except Exception:
            # Deliberate best-effort fallback; narrowed from a bare except
            # so KeyboardInterrupt/SystemExit still propagate.
            pass
        # Try fetching based on name
        try:
            # Fixed: previously passed verbose=True unconditionally here,
            # ignoring the caller's verbose setting.
            return self.get_citation(name=doi, local=local, remote=remote,
                                     verbose=verbose)
        except Exception:
            pass
    # Fetch from CrossRef if database search failed/skipped
    bibtex = cn.content_negotiation(ids=doi, format="bibtex")
    if verbose:
        print('Citation retrieved from CrossRef')
    return load_record('Citation', bibtex)
def query_crossref(title, author):
    """Query Crossref for extracted data

    Args:
        title (str): The title of the paper
        author (List(str)): A list of the authors of the paper

    Raises:
        ExtractionError: No suitable search criteria extracted
        ExtractionError: No suitable Crossref candidates
        ExtractionError: Crossref returned an error

    Returns:
        str: A BibTeX entry for the queried data
    """
    # Search for the paper on Crossref
    cr = Crossref(mailto="*****@*****.**")
    # print("Querying Crossref")
    if author and title:
        r = cr.works(query=title + " " + author[0])
    elif title:
        r = cr.works(query=title)
    else:
        raise ExtractionError("No suitable search criteria extracted")
    # Fixed: open("cn.json", "w") was passed directly to print(), leaking
    # the file handle; a context manager guarantees it is closed/flushed.
    with open("cn.json", "w") as dump_file:
        print(json.dumps(r), file=dump_file)
    if r["status"] == "ok":
        for result in r["message"]["items"]:
            # If the titles are similar enough
            if "title" in result:
                if SequenceMatcher(None, result["title"][0].upper(),
                                   title.upper()).ratio() > 0.9:
                    # If the title is similar enough, perform content negotiation
                    BibTeX = cn.content_negotiation(ids=result["DOI"],
                                                    format="bibentry")
                    return BibTeX
        else:
            # for-else: no candidate cleared the similarity threshold.
            raise ExtractionError("No suitable Crossref candidates")
    else:
        raise ExtractionError("Crossref returned an error")
def get_content(self):
    """Render the workflow's jinja template, filling in each package's
    version and an APA-style citation (resolved via Crossref for DOI
    links)."""
    context = {}
    for hint in self.software_requirement_hints:
        for package in hint['packages']:
            name = package['package']
            versions = package['version']
            citation = package[SCHEMA_ORG_CITATION]
            if citation.startswith(HTTPS_DOI_URL):
                # DOI url: strip the prefix and fetch formatted text.
                doi_name = citation.replace(HTTPS_DOI_URL, '')
                formatted = cn.content_negotiation(ids=doi_name, format="text", style="apa")
            else:
                formatted = citation
            context[name] = {
                'version': versions[-1],
                'citation': formatted,
            }
    context['description'] = self.workflow_version_description
    response = requests.get(self.jinja_template_url)
    response.raise_for_status()
    return Template(response.text).render(**context)
def fetch_bibtex(dois, biblibrary):
    """
    Fetches bibtex for published content.  If there is locally saved
    content, it will load it.  Otherwise, will download from CrossRef.

    Parameters
    ----------
    dois : list
        The reference dois to fetch content for.
    biblibrary : str
        Path to the directory containing bibtex files.

    Returns
    -------
    list of bibtexparser.bibdatabase.BibDatabase
    """
    bib_databases = []
    for doi in dois:
        fname = Path(biblibrary, bibfname(doi))
        try:
            # Load bibtex from file
            with open(fname, encoding='UTF-8') as bibtex_file:
                bibtex = bibtex_file.read()
        except OSError:
            # Narrowed from a bare except: only file-access failures should
            # trigger a CrossRef download (the bare except also swallowed
            # unrelated errors such as KeyboardInterrupt).
            # Download using habanero, then cache it locally.
            bibtex = cn.content_negotiation(ids=doi, format="bibtex")
            with open(fname, 'w', encoding='UTF-8') as bibtex_file:
                bibtex_file.write(bibtex)
        # Parse and extract content
        parser = BibTexParser()
        parser.customization = convert_to_unicode
        bib_databases.append(bibtexparser.loads(bibtex, parser=parser))
    return bib_databases
def get_doi_citation_crossref(doi):
    """
    get

    Parameters
    ----------
    doi : str
        DOI in the format "https://doi.org/10.1109/5.771073" or
        "doi:10.5066/F70R9MFW" or "http://dx.doi.org/10.1109/5.771073"

    Returns
    -------
    dict with publication information pulled from crossref site
    """
    # Fetch citeproc-json from Crossref and decode it.
    raw = cn.content_negotiation(ids=doi, format="citeproc-json")
    cite_data = json.loads(raw)
    cite_data['geoform'] = 'publication'
    # Default the publication place when Crossref does not supply one.
    cite_data['pubplace'] = cite_data.get('publisher-location', 'n/a')
    return cite_data
def test_content_negotiation():
    "content negotiation - default - bibtex"
    res = cn.content_negotiation(ids='10.1126/science.169.3946.635')
    # Fixed: the old assertion `str == str(res).__class__` compared the class
    # of str(res) — always str — so it could never fail. Assert the actual
    # return type instead.
    assert isinstance(res, str)
def update_contents(self, new_store_contents):
    """
    Structure -> mpid -> BibTeX references from MP -> (optional doi lookup
    via Crossref) -> formatting. Formatting is very messy right now. DOI
    lookup and (possibly) formatting should be cached in a builder.
    """
    struct = self.from_data(new_store_contents)
    if not isinstance(struct, Structure):
        raise PreventUpdate(
            "Literature mentions can only be retrieved for crystallographic "
            "structures at present and not molecules. Please make a feature "
            "request if this would be useful for you, and it will be "
            "prioritized."
        )
    with MPRester() as mpr:
        mpids = mpr.find_structure(struct)
        if len(mpids) == 0:
            raise PreventUpdate(
                "No structures in the Materials Project database match this "
                "crystal structure, so literature mentions cannot be retrieved. "
                "Please submit this structure to Materials Project if you'd "
                "like it to be added to the Materials Project database."
            )
        # Collect the newline-separated BibTeX reference blobs per mpid.
        all_references = []
        for mpid in mpids:
            all_references.append(mpr.get_materials_id_references(mpid))
            self.logger.debug(f"Retrieved references for {mpid}.")
    if self.use_crossref:
        cr = Crossref(mailto=CROSSREF_MAILTO)
        # Split the blobs into individual entries (entries are separated by
        # blank lines) and de-duplicate.
        individual_references = set()
        for references in all_references:
            individual_references.update(set(references.split("\n\n")))
        # exclude Materials Project references (these are intended to be
        # references for the structure specifically)
        refs_to_remove = set()
        for ref in individual_references:
            if "Jain2013" in ref:
                refs_to_remove.add(ref)
        individual_references -= refs_to_remove
        works = [cr.works(query=ref, limit=1) for ref in individual_references]
        self.logger.debug(f"Retrieved {len(works)} works from Crossref.")
        items = [
            work["message"]["items"][0]
            for work in works
            if len(work["message"]["items"]) > 0
        ]
        # Keep only confident matches (Crossref relevance score > 40).
        dois_to_item = {
            item["DOI"]: {
                "cited-by": item.get("is-referenced-by-count", 0),
                "score": item["score"],
                "title": item.get("title", None),
                "authors": item.get("author", []),
                "journal": item.get("container-title", [None])[0],
                "issue": item.get("issue", None),
                "volume": item.get("volume", None),
                "pages": item.get("page", None),
                "date-parts": item.get("issued", {}).get("date-parts", [[None]]),
            }
            for item in items
            if item["score"] > 40
        }
        num_refs = len(dois_to_item)
        # Most-cited first.
        sorted_dois = sorted(
            list(dois_to_item.keys()),
            key=lambda doi: -dois_to_item[doi]["cited-by"],
        )
        if self.use_crossref_formatting:
            # use Crossref to retrieve pre-formatted text
            # remove leading "1. " from Science CSL style
            refs = {
                doi: content_negotiation(ids=doi, format="text", style="science")[
                    3:
                ]
                for doi in dois_to_item.keys()
            }
            self.logger.debug(
                f"Retrieved {len(refs)} formatted references from Crossref."
            )
            md = " \n\n".join(
                f"> [{refs[doi]}](https://dx.doi.org/{doi}) "
                f"Cited by {dois_to_item[doi]['cited-by']}."
                for doi in sorted_dois
            )
            formatted_references = dcc.Markdown(
                md, className="mpc-markdown"
            )
        else:
            # else retrieve BibTeX entries to extract a nice author list
            # and perform our own formatting
            entries = {
                doi: content_negotiation(ids=doi, format="bibtex")
                for doi in sorted_dois
            }
            formatted_entries = []
            for doi, entry in entries.items():
                author_string = self._bibtex_entry_to_author_text(entry)
                journal_div = self._item_to_journal_div(dois_to_item[doi])
                formatted_entries.append(
                    html.Blockquote(
                        [
                            html.A(
                                [
                                    html.Div(
                                        [
                                            html.I(
                                                # necessary since titles can contain HTML for superscripts etc.
                                                dcc.Markdown(
                                                    dois_to_item[doi]["title"],
                                                    dangerously_allow_html=True
                                                )
                                            )
                                        ]
                                    ),
                                    html.Div([author_string]),
                                    html.Div(
                                        [
                                            journal_div,
                                            html.Span(
                                                f" Cited by {dois_to_item[doi]['cited-by']}."
                                            ),
                                        ]
                                    ),
                                ],
                                href=f"https://dx.doi.org/{doi}",
                            )
                        ],
                        className="mpc",
                        style={"padding-left": "1rem", "margin-bottom": "1rem"}
                    )
                )
            formatted_references = html.Div(formatted_entries)
    else:
        # this uses pybtex directly on stored BibTeX entries from MP
        # most-accurate references and faster since no Crossref lookup
        # is required but no dois/hyperlinks available
        all_entries = {}
        for references in all_references:
            all_entries.update(Parser().parse_string(references).entries)
        md = self._pybtex_entries_to_markdown(all_entries)
        formatted_references = dcc.Markdown(md, className="mpc-markdown")
        num_refs = len(all_entries)
    return html.Div(
        [
            Label(f"{num_refs} references found{':' if num_refs>0 else '.'}"),
            formatted_references,
        ],
        style={"max-height": "20rem", "overflow-y": "scroll"},
    )
def batch_doi2pmid(dois):
    """
    resolve article PMID from DOI by feeding article citation to PubMed advanced search
    @param dois: list of DOIs to resolve
    @return: list of corresponding PMIDs
    """
    citations = []
    for doi in dois:
        # Strip a trailing period that would break DOI resolution.
        if doi[-1] == '.':
            doi = doi[:-1]
        # Retry loop: transient failures (503, timeouts, connection errors)
        # are retried after a short sleep; other errors skip this DOI.
        while True:
            try:
                # what if one fails?!
                print('bp7', doi)
                cit = cn.content_negotiation(ids=doi, format="citeproc-json", timeout=300)
                print('bp7 end')
                if isinstance(cit, list):
                    for c in cit:
                        citations.append(c)
                else:
                    citations.append(cit)
                break
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 503:
                    print('retrying...', e)
                    time.sleep(5)
                    continue
                elif e.response.status_code == 500:
                    print('500 error', e.response.json())
                    break
                else:
                    print('UNHANDLED HTTP ERROR', e)
                    break
            except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
                print('timeout or connection error, retrying', e)
                time.sleep(5)
                continue
    parsed_citations = []
    for x in citations:
        print('bp8')
        try:
            cit = json.loads(x)
        except TypeError as e:
            print(e)
            continue
        # Map citeproc-json fields onto the ecitmatch citation schema.
        parsed_cit = {}
        if 'page' in cit:
            parsed_cit['first_page'] = cit['page'].split('-')[0]
        if 'volume' in cit:
            parsed_cit['volume'] = cit['volume']
        if 'container-title' in cit:
            parsed_cit['journal'] = cit['container-title']
        if 'issued' in cit:
            parsed_cit['year'] = cit['issued']['date-parts'][0][0]
        if 'author' in cit:
            if 'family' in cit['author'][0]:
                parsed_cit['aulast'] = cit['author'][0]['family']
        parsed_citations.append(parsed_cit)
    print('bp9')
    pmids = ecitmatch_tools.batch_pmids_for_citation(parsed_citations, debug=True)
    print('bp10')
    return pmids
def test_content_negotiation_citeproc_json():
    "content negotiation - citeproc-json"
    res = cn.content_negotiation(ids='10.1126/science.169.3946.635',
                                 format="citeproc-json")
    # Fixed: `str == str(res).__class__` was always true (str(res) is always
    # a str); assert the actual return type instead.
    assert isinstance(res, str)
def crossref_query(authors, title):
    """ Query Crossref database.

    Args:
        authors (list): a list of strings for up the first authors last names.
        title (str): the title of the article.
        filename (str): the original path of the file to link to.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of
        the match given by Crossref.
    """
    cr = Crossref()
    # works?query.title=An+Improved+Adaptive+Constraint+Aggregation+for+Integrated+Layout+and+Topology+Optimization&query.author=Gao+Zhu+Zhang+Zhou&sort=score&rows=1
    # query = ['+' + name + '' for name in authors]
    # query = 'query.title=' + urllib.parse.quote_plus(title) + '&query.author=' + urllib.parse.quote_plus(' '.join(authors)) + '&sort=score&rows=1'
    # print(query)
    # Use fielded title/author queries when authors are known, otherwise a
    # generic query on the title alone.
    if ''.join(authors):
        args = dict(
            query_title=urllib.parse.quote_plus(title),
            query_author=urllib.parse.quote_plus(' '.join(authors))
        )
    else:
        args = dict(
            query=urllib.parse.quote_plus(title),
        )
    x = cr.works(sort='score', limit=1, **args)
    # x = cr.works(query=query)
    assert x['status'] == "ok"
    # No result found
    if not x['message']['items']:
        print_score(0)
        return (None, [], 0)
    best_item = x['message']['items'][0]
    # print(json.dumps(best_item, indent=4))
    # Items are score-sorted: stop at the first score drop, keep best of ties.
    for item in x['message']['items']:
        if item['score'] < best_item['score']:
            break
        else:
            best_item = pick_best(title, best_item, item)
    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item
    # If the entry is invalid, return a score of 0
    if 'author' not in res_json or not res_json['title']:
        print_score(0)
        return (None, res_json, 0)
    # Retrieve metadata as bibtex entry
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    # Repair common mojibake sequences from mis-encoded metadata.
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ă', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('Ěo', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]
    # If article has subtitle(s), fix bibtex entry
    subtitles = None
    if 'subtitle' in res_json:
        subtitles = [x for x in res_json['subtitle'] if not str.isupper(x)]
    if subtitles:
        # Discard subtitle that are all uppercase
        title = ' '.join(res_json['title'])
        subtitle = ' '.join(subtitles)
        if title.lower().startswith(subtitle.lower()) or utils.simratio(title, subtitle) > 0.95:
            # Don't repeat title if the subtitle is too similar to the title
            new_title = title
        else:
            new_title = title + ": " + subtitle
        res_bib['title'] = new_title
    else:
        new_title = ' '.join(res_json['title'])
        res_bib['title'] = new_title
    # Post-process title
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])
    # If bibtex entry has a 'journal' field, then use the longest alias from the json
    if 'journal' in res_bib:
        best = ""
        for container in res_json['container-title']:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best
    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0
    # Fix incorrect year in crossref entry
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            year = date[0]
            month = date[1] if len(date) > 1 else None
            if str(year) != res_bib['year']:
                res_bib['year'] = str(year)
                if month is None and 'month' in res_bib:
                    del res_bib['month']
                elif month is not None:
                    assert month >= 1 and month <= 12
                    month_str = utils.MONTHS[month - 1]
                    res_bib['month'] = month_str
    # Fix potential ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)
    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)
    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)
    # Return database entry
    return (res_bib, res_json, score)
def crossref_query(authors, title):
    """ Query Crossref database.

    Args:
        authors (list): a list of strings for up the first authors last names.
        title (str): the title of the article.
        filename (str): the original path of the file to link to.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of
        the match given by Crossref.
    """
    cr = Crossref()
    # Build a single free-text query: each author quoted and required, plus
    # the quoted title.
    query = ['+"' + name + '"' for name in authors]
    query = ' '.join(query) + ' +"' + title + '"'
    x = cr.works(query=query)
    assert x['status'] == "ok"
    # No result found
    if not x['message']['items']:
        print_score(0)
        return (None, [], 0)
    best_item = x['message']['items'][0]
    # Items are score-sorted: stop at the first score drop, keep best of ties.
    for item in x['message']['items']:
        if item['score'] < best_item['score']:
            break
        else:
            best_item = pick_best(title, best_item, item)
    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item
    # If the entry is invalid, return a score of 0
    if 'author' not in res_json or not res_json['title']:
        print_score(0)
        return (None, res_json, 0)
    # Retrieve metadata as bibtex entry
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    # Repair common mojibake sequences from mis-encoded metadata.
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ă', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('Ěo', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]
    # If article has subtitle(s), fix bibtex entry
    if 'subtitle' in res_json:
        subtitles = [x for x in res_json['subtitle'] if not str.isupper(x)]
    else:
        subtitles = []
    if len(subtitles) > 0:
        # Discard subtitle that are all uppercase
        title = ' '.join(res_json['title'])
        subtitle = ' '.join(subtitles)
        if title.lower().startswith(
                subtitle.lower()) or utils.simratio(title, subtitle) > 0.95:
            # Don't repeat title if the subtitle is too similar to the title
            new_title = title
        else:
            new_title = title + ": " + subtitle
        res_bib['title'] = new_title
    else:
        new_title = ' '.join(res_json['title'])
        res_bib['title'] = new_title
    # Post-process title
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])
    # If bibtex entry has a 'journal' field, then use the longest alias from the json
    if 'journal' in res_bib:
        best = ""
        for container in res_json['container-title']:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best
    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0
    # Fix incorrect year in crossref entry
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            year = date[0]
            month = date[1] if len(date) > 1 else None
            if str(year) != res_bib['year']:
                res_bib['year'] = str(year)
                if month is None and 'month' in res_bib:
                    del res_bib['month']
                elif month is not None:
                    assert month >= 1 and month <= 12
                    month_str = utils.MONTHS[month - 1]
                    res_bib['month'] = month_str
    # Fix potential ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)
    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)
    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)
    # Return database entry
    return (res_bib, res_json, score)
def test_content_negotiation_with_unicode_doi():
    "content negotiation - unicode"
    res = cn.content_negotiation(ids=u"10.1126/science.169.3946.635")
    # Fixed: `str == str(res).__class__` was always true (str(res) is always
    # a str); assert the actual return type instead.
    assert isinstance(res, str)
def test_content_negotiation_alt_url():
    "content negotiation - alternative url"
    res = cn.content_negotiation(
        ids="10.1126/science.169.3946.635", url="http://doi.org"
    )
    # Fixed: `str == str(res).__class__` was always true (str(res) is always
    # a str); assert the actual return type instead.
    assert isinstance(res, str)
def test_content_negotiation_ids_missing():
    "content negotiation - calling without ids raises TypeError"
    # Callable form of pytest.raises; equivalent to the context manager.
    pytest.raises(TypeError, cn.content_negotiation)
def test_content_negotiation_raises_an_http_error_with_bad_requests():
    "content negotiation - an invalid DOI raises HTTPError"
    with pytest.raises(HTTPError):
        # Fixed: dropped the unused `res =` binding — the value is never
        # reachable when the expected exception is raised.
        cn.content_negotiation(ids="10.1126/foo")
def test_content_negotiation_ids_none():
    "content negotiation - ids=None raises TypeError"
    # Callable form of pytest.raises; equivalent to the context manager.
    pytest.raises(TypeError, cn.content_negotiation, ids=None)