# Shared imports for the snippets below.
import random
import time

import click
import requests
from crossref_commons.iteration import iterate_publications_as_json
from crossref_commons.retrieval import get_publication_as_refstring


def works_with_prefix(prefix):
    # Iterate over publications registered under a DOI prefix and, for each
    # record with a URL, print the URL and the location it redirects to.
    filter_ = {'prefix': prefix}  # renamed to avoid shadowing the builtin filter
    for p in iterate_publications_as_json(max_results=100000, filter=filter_):
        if 'URL' in p:
            click.echo(p['URL'] + "\t", nl=False)
            # HEAD request without following redirects, so the DOI
            # resolver's target shows up in the Location header.
            r = requests.head(p['URL'])
            if r.is_redirect:
                click.echo(r.headers['Location'])
            time.sleep(0.2)  # be polite to the resolver
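
# A minimal sketch of exercising works_with_prefix above; the prefix value is
# illustrative (10.1016 is Elsevier's DOI prefix), and the call prints one
# tab-separated line per resolvable DOI.
works_with_prefix("10.1016")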

# Paper, getBibtex and similarStrings are project-local helpers assumed to be
# importable from the surrounding package.
from requests.exceptions import ConnectionError  # the error requests raises on network failure


def getPapersInfo(papers, scholar_search_link, restrict, scholar_results):
    papers_return = []
    num = 1
    for paper in papers:
        if num > scholar_results:  # handle each paper once, at most scholar_results in total
            break
        title = paper['title']
        queries = {
            'query.bibliographic': title.lower(),
            'sort': 'relevance',
            'select': 'DOI,title,deposited,author,short-container-title'
        }

        print("Searching paper {} of {} on Crossref...".format(num, scholar_results))
        num += 1

        found_timestamp = 0
        paper_found = Paper(title, paper['link'], scholar_search_link,
                            paper['cites'], paper['link_pdf'],
                            paper['year'], paper['authors'])
        while True:
            try:
                for el in iterate_publications_as_json(max_results=30, queries=queries):
                    el_date = 0
                    if "deposited" in el and "timestamp" in el["deposited"]:
                        el_date = int(el["deposited"]["timestamp"])

                    # Keep the most recently deposited record whose title is
                    # sufficiently similar to the Scholar title.
                    if (paper_found.DOI is None or el_date > found_timestamp) \
                            and "title" in el \
                            and similarStrings(title.lower(), el["title"][0].lower()) > 0.75:
                        found_timestamp = el_date

                        if "DOI" in el:
                            paper_found.DOI = el["DOI"].strip().lower()
                        if "short-container-title" in el and len(el["short-container-title"]) > 0:
                            paper_found.jurnal = el["short-container-title"][0]

                        if restrict is None or restrict != 1:
                            paper_found.setBibtex(getBibtex(paper_found.DOI))
                break
            except ConnectionError:
                print("Wait 10 seconds and try again...")
                time.sleep(10)

        papers_return.append(paper_found)
        time.sleep(random.randint(1, 10))  # throttle between Crossref queries

    return papers_return
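
# Illustrative call of getPapersInfo above. The paper dict keys ('title',
# 'link', 'cites', 'link_pdf', 'year', 'authors') are inferred from how the
# function reads them, not from a documented schema, and the values are fake.
example_papers = [{
    'title': 'Attention Is All You Need',
    'link': 'https://example.org/paper',
    'cites': '0',
    'link_pdf': None,
    'year': '2017',
    'authors': 'Vaswani et al.',
}]
found = getPapersInfo(example_papers, scholar_search_link="",
                      restrict=None, scholar_results=1)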

def get_data(count=5):
    filter_ = {"has-abstract": "true", "type": "journal-article"}
    queries = {}  # e.g. {'query': 'machine learning'}
    try:
        # Note: iterate_publications_as_json is lazy, so network errors can
        # also surface while looping below, not only here.
        publications = iterate_publications_as_json(
            max_results=count, filter=filter_, queries=queries)
    except Exception:
        print("There was an error accessing the Crossref API")
        return []

    data = []
    datasource = ("Crossref API",)
    datasource_url = "https://api.crossref.org/"
    for p in publications:
        # if p['language'] != 'en':
        #     continue
        abstract = p["abstract"]  # guaranteed by the has-abstract filter
        authors = []
        for author in p.get("author", []):
            authors.append(author["given"] + " " + author["family"])
        title = p["title"][0] if p.get("title") else ""
        links = list(p.get("link", []))
        # TODO: find most relevant link in list
        ref = "" if len(links) == 0 else links[0]
        # TODO: extract keywords from pdf (in link) if available
        keywords = []
        data.append({
            "title": title,
            "abstract": abstract,
            "keywords": keywords,
            "author": authors,
            "ref": ref,
            "datasource": datasource,
            "datasource_url": datasource_url,
        })
    return data
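
# Quick check of get_data above: fetch a few journal articles that carry
# abstracts and print their titles.
for record in get_data(count=3):
    print(record["title"])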

# Earlier variant of getPapersInfo that indexes papers positionally; same
# imports and project-local helpers (Paper, getBibtex, similarStrings) as the
# version above.
def getPapersInfo(papers, scholar_search_link, restrict):
    papers_return = []
    num = 1
    for paper in papers:
        title = paper[0].lower()
        queries = {
            'query.bibliographic': title,
            'sort': 'relevance',
            'select': 'DOI,title,deposited,author,short-container-title'
        }

        print("Searching paper {} of {} on Crossref...".format(num, len(papers)))
        num += 1

        found_timestamp = 0
        paper_found = Paper(title, paper[1], scholar_search_link, paper[2], paper[3])
        for el in iterate_publications_as_json(max_results=30, queries=queries):
            el_date = 0
            if "deposited" in el and "timestamp" in el["deposited"]:
                el_date = int(el["deposited"]["timestamp"])

            # Keep the most recently deposited record whose title is
            # sufficiently similar to the query title.
            if (paper_found.DOI is None or el_date > found_timestamp) \
                    and "title" in el \
                    and similarStrings(title, el["title"][0].lower()) > 0.75:
                found_timestamp = el_date

                if "DOI" in el:
                    paper_found.DOI = el["DOI"].strip().lower()
                if "short-container-title" in el and len(el["short-container-title"]) > 0:
                    paper_found.jurnal = el["short-container-title"][0]

                if restrict is None or restrict != 1:
                    paper_found.setBibtex(getBibtex(paper_found.DOI))

        papers_return.append(paper_found)
        time.sleep(random.randint(1, 10))  # throttle between Crossref queries

    return papers_return
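
# The variant above takes positional tuples rather than dicts; an illustrative
# call, with the field order inferred from Paper(title, paper[1], ...,
# paper[2], paper[3]) and fake values throughout:
found = getPapersInfo([("attention is all you need",
                        "https://example.org/paper", "0", None)],
                      scholar_search_link="", restrict=None)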

# Method from a publication/citation class; relies on self.pieces (author,
# title, and free-form query strings) and self.key (the desired citation key).
def _get_bibtex_crossref(self) -> None:
    """Internal helper that fetches the bibtex entry and records whether the
    publication exists and whether the match is unique.

    Note:
        Results are cached on the instance (self._exists, self._is_unique,
        self._bibtex); nothing is returned.
    """
    queries = dict(
        zip(["query.author", "query.title", "query.bibliographic"], self.pieces)
    )
    queries["sort"] = "relevance"
    iter_pub = iterate_publications_as_json(queries=queries)

    try:
        doi = next(iter_pub)["DOI"]
        self._exists = True
    except StopIteration:
        self._exists = False
        return

    try:
        next(iter_pub)
        self._is_unique = False
    except StopIteration:
        self._is_unique = True

    # This is almost correct! We just need to change the citation key to
    # self.key.
    raw_bibtex: str = get_publication_as_refstring(doi, "bibtex")

    # Here we assume the first line is always "@something{OLD_CITATION,".
    # Use "{" and "," to find OLD_CITATION and replace it with self.key.
    a, b = raw_bibtex.split("{", 1)
    self._bibtex = a + "{" + self.key + "," + b.split(",", 1)[1]

    # Remove leading whitespace if necessary.
    if self._bibtex[0] == " ":
        self._bibtex = self._bibtex[1:]
    return
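
# Illustration of the citation-key rewrite above on a made-up bibtex string;
# the class wiring (self.pieces, self.key) is omitted.
raw = "@article{Smith2020,\n  title = {An Example},\n}"
a, b = raw.split("{", 1)
print(a + "{" + "mykey" + "," + b.split(",", 1)[1])
# prints: @article{mykey,
#           title = {An Example},
#         }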

# The heroku flag plus enter_name_here, process and ar_manipulation are
# project-local; only requests and iterate_publications_as_json come from
# third-party packages.
def call_from_front_end(NAME):
    if not heroku:
        scholar_link = ('https://scholar.google.com/scholar'
                        '?hl=en&as_sdt=0%2C3&q=') + str(NAME)
        _, _, ar = enter_name_here(scholar_link, NAME)
    if heroku:
        filter_ = {'type': 'journal-article'}
        queries = {'query.author': NAME}
        ar = []
        bi = list(iterate_publications_as_json(max_results=50,
                                               filter=filter_,
                                               queries=queries))
        for p in bi[0:9]:
            # Ask Unpaywall for an open-access PDF; the API requires a real
            # contact address in place of YOUR_EMAIL.
            res = ('https://api.unpaywall.org/v2/' + str(p['DOI'])
                   + '?email=YOUR_EMAIL')
            response = requests.get(res)
            # Read the JSON body (the Response object itself is not
            # subscriptable); best_oa_location may be None for
            # closed-access items.
            temp = response.json()['best_oa_location']['url_for_pdf']
            urlDat = process(temp)
            if urlDat is not None:
                ar.append(urlDat)

    (ar, trainingDats) = ar_manipulation(ar)
    # Disabled: merge the new results into the cached training data.
    # with open('data/traingDats.p', 'rb') as f:
    #     trainingDats_old = pickle.load(f)
    # trainingDats.extend(trainingDats_old)
    # with open('data/traingDats.p', 'wb') as f:
    #     pickle.dump(trainingDats, f)
    return ar
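
# Illustrative call of call_from_front_end above; assumes the surrounding app
# defines the heroku flag and the helper functions, that YOUR_EMAIL has been
# replaced with a real address, and that the author name is fake.
results = call_from_front_end("Jane Doe")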