def main(conf):
    if conf.cache:
        paperstore = PaperStore()
    else:
        paperstore = None

    if conf.engine == "scholar":
        searcher = GScholarSearcher(paperstore)
    # elif conf.engine == "pubmed":
    #     searcher = PubMedSearcher(paperstore)
    else:
        raise ValueError("Unknown search engine: %s" % conf.engine)

    # the query can come from a file or straight from the command line
    if conf.query_file:
        with open(conf.query_file, 'r') as f:
            query = f.read()
    else:
        query = conf.query

    print("Query:", query)

    results = searcher.search(query, min_year=conf.year_start, max_results=conf.max)

    if conf.cache:
        found, missing = paperstore.matchResultsWithPapers(results)
        papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
        paperstore.updatePapers(papers_to_add)

    writeBibtex([Paper(res.bib, res.extra_data) for res in results], conf.file)
def main(conf):
    if conf.cache:
        paperstore = PaperStore()
    else:
        paperstore = None

    bib_entries = loadRefsFromHTML(conf.input)
    results = getSearchResultsFromBib(bib_entries)

    if paperstore:
        found, missing = paperstore.matchResultsWithPapers(results)
    else:
        found = []
        missing = results

    papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]

    # copy any url/eprint links from the bib data onto the stored papers
    counter = 0
    for res in found:
        if res.bib.get('url'):
            if addUrlIfNewWithType(res.paper, res.bib['url'], 'endnote'):
                counter += 1
        if res.bib.get('eprint'):
            if addUrlIfNewWithType(res.paper, res.bib['eprint'], 'endnote'):
                counter += 1

    papers_existing = [res.paper for res in found]
    if paperstore:
        paperstore.updatePapers(papers_existing)

    print('Papers found', len(papers_existing))
    print('Papers not found', len(papers_to_add))
    print('Added', counter, 'urls')
def loadEntriesAndSetUp(input, use_cache=True, max_results=10000000):
    if use_cache:
        paperstore = PaperStore()
    else:
        paperstore = None

    bib_entries = readInputBib(input)
    results = getSearchResultsFromBib(bib_entries, max_results)
    results = simpleResultDeDupe(results)

    if paperstore:
        found, missing = paperstore.matchResultsWithPapers(results)
    else:
        found = []
        missing = results

    papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
    papers_existing = [mergeResultData(res, res.paper) for res in found]
    all_papers = papers_to_add + papers_existing

    # FIXME: a second dedupe is needed because it seems I'm matching the wrong paper.
    # A total of 5 records suffer from this, so it's no big deal.
    all_papers = simpleResultDeDupe(all_papers)

    return paperstore, papers_to_add, papers_existing, all_papers
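# A minimal usage sketch (not part of the original module): it assumes loadEntriesAndSetUp
# is importable from here and that 'references.bib' is a hypothetical BibTeX file on disk.
def _demo_load(path='references.bib'):
    paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(path, use_cache=True)
    print('New papers:', len(papers_to_add))
    print('Already cached:', len(papers_existing))
    print('Total after dedupe:', len(all_papers))
    return paperstore, all_papers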
def set_union(a, b):
    """
    Union of two paper collections. `a` and `b` are dicts mapping a paper key to a bib dict;
    returns one Paper per distinct key, with empty extra_data.
    """
    res = set(a.keys()) | set(b.keys())
    full_dict = merge_two_dicts(a, b)
    res_list = [value for key, value in full_dict.items() if key in res]
    return [Paper(x, {}) for x in res_list]
def set_intersect(a, b):
    """
    Intersection of two paper collections: one Paper per key present in both `a` and `b`,
    taking the bib dict from `a`.
    """
    res = set(a.keys()) & set(b.keys())
    res_list = [value for key, value in a.items() if key in res]
    return [Paper(x, {}) for x in res_list]
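# Hedged example of how set_union / set_intersect seem meant to be called: both take dicts
# keyed by some stable paper identifier (the DOIs here are made up for illustration) with
# bib dicts as values, and return lists of Paper objects.
def _demo_set_ops():
    a = {'10.1000/1': {'title': 'Paper One', 'year': '2019'},
         '10.1000/2': {'title': 'Paper Two', 'year': '2020'}}
    b = {'10.1000/2': {'title': 'Paper Two', 'year': '2020'},
         '10.1000/3': {'title': 'Paper Three', 'year': '2021'}}
    union_papers = set_union(a, b)       # 3 Paper objects
    common_papers = set_intersect(a, b)  # 1 Paper object
    return union_papers, common_papers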
def enrichMetadata(paper: Paper, identity):
    """
    Tries to retrieve metadata from Crossref and the abstract from SemanticScholar for a
    given paper, falling back on the Google Scholar bib data if all else fails.

    :param paper: Paper instance
    :param identity: identity details passed on to the individual scrapers
    """
    paper.title = basicTitleCleaning(paper.title)
    original_title = paper.title

    if paper.pmid and not paper.extra_data.get("done_pubmed"):
        pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # if we don't have a DOI, we need to find it on Crossref
    if not paper.doi and not paper.extra_data.get('done_crossref', False):
        crossref_scraper.matchPaperFromResults(paper, identity)
        if paper.doi:
            new_bib = getBibtextFromDOI(paper.doi)
            paper = mergeResultData(
                paper,
                SearchResult(1, new_bib[0], 'crossref', paper.extra_data))
        paper.extra_data['done_crossref'] = True

    # if we have a DOI and we haven't got the abstract yet
    if paper.doi and not paper.extra_data.get('done_semanticscholar'):
        semanticscholarmetadata.getMetadata(paper)
        paper.extra_data['done_semanticscholar'] = True

    # try PubMed if we still don't have a PMID
    if not paper.pmid and not paper.extra_data.get('done_pubmed'):
        # if (not paper.doi or not paper.has_full_abstract) and not paper.pmid and not paper.extra_data.get('done_pubmed'):
        if pubmed_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.4):
            pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # still no SemanticScholar id? maybe we can match the paper on SemanticScholar
    if not paper.extra_data.get('ss_id') and not paper.extra_data.get('done_semanticscholar'):
        semanticscholarmetadata.matchPaperFromResults(paper, identity)
        paper.extra_data['done_semanticscholar'] = True

    # # time to try Scopus, see if it's behind a paywall
    # if not paper.doi and not paper.extra_data.get('done_scopus'):
    #     semanticscholarmetadata.getMetadata(paper)
    #     paper.extra_data['done_semanticscholar'] = True

    # if we don't have an abstract, maybe it's on arXiv
    if not paper.has_full_abstract and not paper.extra_data.get('done_arxiv'):
        # if not paper.extra_data.get('done_arxiv'):
        arxiv_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.35)
        paper.extra_data['done_arxiv'] = True

    # try to get open access links if a DOI is present but a PDF link is missing
    if not paper.has_pdf_link and paper.doi and not paper.extra_data.get('done_unpaywall'):
        unpaywall_scraper.getMetadata(paper, identity)
        paper.extra_data['done_unpaywall'] = True

    # if all else has failed but we have a link to Google Scholar bib data, get that
    if not paper.year and paper.extra_data.get('url_scholarbib'):
        scholar_scraper.getBibtex(paper)

    if paper.title != original_title:
        print('Original: %s\nNew: %s' % (original_title, paper.title))

    paper.bib = fixBibData(paper.bib, 1)
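# Sketch of a typical enrichment loop built from the pieces above. It assumes
# loadEntriesAndSetUp is in scope and that `identity` is a small dict of contact details
# handed to the scrapers; the exact shape of `identity` is an assumption, not something
# this module defines.
def _demo_enrich_all(input_file='references.bib'):
    paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(input_file)
    identity = {'name': 'Your Name', 'email': 'you@example.com'}  # hypothetical contents
    for paper in all_papers:
        enrichMetadata(paper, identity)
    if paperstore:
        paperstore.updatePapers(all_papers)
    return all_papers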
def getMetadata(self, paper, get_citing_papers=False):
    if not paper.doi and not paper.extra_data.get('ss_id'):
        raise ValueError('paper has no DOI or SSID')

    # prefer the SemanticScholar id if we already have one, otherwise look up by DOI
    if paper.extra_data.get('ss_id'):
        unique_id = paper.extra_data.get('ss_id')
    else:
        unique_id = paper.doi

    url = 'https://api.semanticscholar.org/v1/paper/' + unique_id
    r = self.request(url)
    d = r.json()

    if 'error' in d:
        print("SemanticScholar error:", d['error'])
        return

    for field in ['abstract', 'year', 'venue']:
        if d.get(field):
            paper.bib[field] = str(d[field])

    if d.get('arxivId'):
        paper.arxivid = d['arxivId']

    for topic in d['topics']:
        # we really don't need to store the url, it's just
        # https://www.semanticscholar.org/topic/{topicId}
        del topic['url']

    authors = self.loadSSAuthors(d['authors'])
    paper.bib['author'] = authorListFromDict(authors)

    paper.extra_data['ss_topics'] = d['topics']
    paper.extra_data['ss_authors'] = d['authors']
    paper.extra_data['ss_id'] = d['paperId']

    if get_citing_papers:
        citing_papers = []
        for index, citation in enumerate(d['citations']):
            ss_authors = semanticscholarmetadata.loadSSAuthors(citation['authors'])
            authors = authorListFromDict(ss_authors)

            bib = {
                'title': citation['title'],
                'author': authors,
                'year': citation['year'],
                'doi': citation['doi'],
            }
            bib = fixBibData(bib, index)

            extra_data = {
                'ss_id': citation['paperId'],
                'ss_influential': citation['isInfluential'],
                'ss_authors': ss_authors
            }

            if citation.get('arxivId'):
                extra_data['arxivid'] = citation.get('arxivId')

            new_paper = Paper(bib, extra_data)
            citing_papers.append(new_paper)

        return paper, citing_papers

    return paper
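# Hedged usage sketch for getMetadata: `scraper` stands in for an instance of the (unnamed
# here) SemanticScholar scraper class, the DOI is made up, and Paper is assumed to expose
# .doi from its bib dict, as the code above implies.
def _demo_citing_papers(scraper, doi='10.1000/xyz123'):
    paper = Paper({'title': 'Some Paper', 'doi': doi}, {})
    result = scraper.getMetadata(paper, get_citing_papers=True)
    if result is None:  # the API returned an error payload
        return []
    paper, citing = result
    print(len(citing), 'citing papers retrieved from SemanticScholar')
    return citing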