def semantic_scholar_alias(NAME):
    """
    Given an author name, look up that author's publications on Semantic
    Scholar (via author_to_urls) and return any name aliases found for them.
    """
    aliases = None
    dois, coauthors, titles, visit_urls = author_to_urls(NAME)
    for d in dois:
        paper = sch.paper(d, timeout=32)
        if "authors" in paper.keys():
            for co_name in paper["authors"]:
                key = co_name["name"]
                # match on first name (in either direction) or on last name
                if (NAME.split(" ")[0] in key.split(" ")[0]
                        or key.split(" ")[0] in NAME.split(" ")[0]
                        or NAME.split(" ")[-1] in key.split(" ")[-1]):
                    author = sch.author(co_name["authorId"], timeout=32)
                    if "aliases" in author.keys():
                        aliases = author["aliases"]
    return aliases

def get_info(item, soup):
    root = 'https://www.semanticscholar.org/paper/'
    paper_data = soup.find_all(class_='search-result-title')
    if len(paper_data) == 0:
        item['cite_num'] = ''
        item['abstract'] = ''
        return
    first_paper = paper_data[0]
    url = root + first_paper['href']
    soup = get_soup(url)
    if soup is None:
        item['cite_num'] = ''
        item['abstract'] = ''
        return
    doi_data = soup.find_all(class_='doi__link')
    if len(doi_data) == 0:
        item['cite_num'] = ''
        item['abstract'] = ''
        return
    doi = doi_data[0].text
    paper = sch.paper(doi)
    item['cite_num'] = len(paper['citations'])
    item['abstract'] = paper['abstract']  # store the abstract text, not its length

def fetch_from_id(self, paper_id: PaperId) -> Optional[PaperAndRefs]:
    """Return the paper for `paper_id`, or None if it cannot be fetched."""
    if paper_id in self.memcache:
        return self.memcache[paper_id]
    found = self.__paper_from_db_wrapper(paper_id, True)
    if found:
        return found
    try:
        paper_dict: Dict = semanticscholar.paper(paper_id)
        error = len(paper_dict.keys()) == 0
    except requests.exceptions.RequestException as e:
        print(f"[ERROR] {str(e)}")
        error = True
    if error:
        result = None
    else:
        result = self.__update_db(response=paper_dict)
        result.id = paper_dict["paperId"]
    self.memcache[paper_id] = result
    return result

def query_semantic_scholar_by_id(paper_id):
    paper = semanticscholar.paper(paper_id, timeout=10)
    print(paper.keys())
    print(paper['title'])
    for author in paper['authors']:
        print(author['name'])
    return paper

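# Hedged usage sketch for query_semantic_scholar_by_id above. The DOI is the
# Turing paper that the tests in this collection also use; any DOI, arXiv id,
# or Semantic Scholar paper id should work the same way.
if __name__ == "__main__":
    turing = query_semantic_scholar_by_id('10.1093/mind/lix.236.433')
    print(len(turing.get('citations', [])), 'citations returned')
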
def fetch_semantic_papers(author):
    '''Queries Semantic Scholar to download info for all papers named in a given author object'''
    filled_papers = []
    for entry in author['papers']:
        paper = sch.paper(entry['paperId'], timeout=2)
        filled_papers.append(paper)
    return filled_papers

def test_paper(self):
    data = sch.paper('10.1093/mind/lix.236.433', timeout=5)
    self.assertEqual(data['title'], 'Computing Machinery and Intelligence')
    self.assertRaises(
        Timeout, sch.paper, '10.1093/mind/lix.236.433', timeout=0.01)

def get_semantic_info(item):
    doi = item['doi']
    paper = sch.paper(doi)
    if paper is None:
        return item
    item['cite_num'] = len(paper['citations'])
    item['abstract'] = break_line(paper['abstract'])
    return item

def fetch_paper_information(arxiv_id):
    print("Fetching: {}".format(arxiv_id))
    time.sleep(2.5)  # throttle calls to the API
    paper = sch.paper("arXiv:{}".format(arxiv_id),
                      timeout=10,
                      include_unknown_references=True)
    try:
        return paper, [
            p["arxivId"] for p in paper["references"] if p["arxivId"] is not None
        ]
    except KeyError:
        return paper, []

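# Hedged sketch of how fetch_paper_information above might seed a one-hop
# crawl of references; `seed_arxiv_id` is a placeholder, not an id taken from
# any snippet in this collection.
def crawl_one_hop(seed_arxiv_id):
    seen = {}
    paper, ref_ids = fetch_paper_information(seed_arxiv_id)
    seen[seed_arxiv_id] = paper
    for ref_id in ref_ids:
        if ref_id not in seen:
            seen[ref_id], _ = fetch_paper_information(ref_id)
    return seen
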
def paperid(arxiv_id):
    trial_num = 0
    while trial_num < sch_max_trial:
        paper = sch.paper(arxiv_id, timeout=2)
        if paper == {}:
            print(str(trial_num + 1) + 'th sch parse error for ' + arxiv_id)
        else:
            return paper['paperId']
        trial_num += 1
        if trial_num < sch_max_trial:
            print('sleep and retry for ' + arxiv_id)
            time.sleep(sch_call_sleep)
        else:
            raise Exception('fatal parse error for ' + arxiv_id)

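# The retry helper above assumes module-level settings along these lines; the
# names come from the function itself, but the values here are illustrative only.
sch_max_trial = 5    # maximum number of attempts per arXiv id
sch_call_sleep = 60  # seconds to sleep between retries
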
def papercollector(id, year):
    author = sch.author(id)
    papers = author["papers"]
    collectedpapers = []
    for p in papers:
        if p["year"] == year:
            a = sch.paper(p["paperId"])["abstract"]
            try:
                lan = detect(a)
                if lan == 'en':
                    p["abstract"] = a
                    collectedpapers.append(p)
            except TypeError:
                # detect() raises TypeError when the abstract is None
                collectedpapers.append(p)
    print(collectedpapers)
    return collectedpapers

def visit_semantic_scholar_abstracts(NAME, tns, more_links):
    """
    Given an author name, visit the Semantic Scholar record of each of the
    author's papers (via author_to_urls) and collect abstract and citation
    metadata for them.
    """
    author_results = []
    aliases = None
    dois, coauthors, titles, visit_urls = author_to_urls(NAME)
    for d in tqdm(dois, desc="visiting abstracts"):
        paper = sch.paper(d, timeout=8)
        urlDat = {}
        if "citationVelocity" in paper.keys():
            urlDat["citationVelocity"] = paper["citationVelocity"]
        if "fieldsOfStudy" in paper.keys():
            urlDat["fieldsOfStudy"] = str(paper["fieldsOfStudy"])
        if "numCitedBy" in paper.keys():
            urlDat["numCitedBy"] = paper["numCitedBy"]
        # urlDat["influentialCitationCount"] = paper["influentialCitationCount"]
        urlDat["semantic"] = True
        if "url" in paper.keys():
            urlDat["link"] = paper["url"]
        if aliases is None and "aliases" in paper.keys():
            urlDat["aliases"] = paper["aliases"]
        if "abstract" in paper.keys():
            urlDat = text_proc(str(paper["abstract"]), urlDat)
        author_results.append(urlDat)
    author_results = [
        urlDat for urlDat in author_results if not isinstance(urlDat, type(None))
    ]
    return author_results, visit_urls

def scrape():  # pylint:disable=too-many-locals
    """Scrape research papers from the Semantic Scholar API"""
    # Determine the ids of all relevant research papers.
    papers = get_all_papers()
    for paperId in papers:
        # Get all relevant information for the paper: id, title, abstract, year
        paper = ss.paper(paperId)
        paperTitle = paper['title']
        paperAbstract = paper['abstract']
        paperYear = paper['year']
        citations = paper['citations']
        paperCitations = len(citations)
        # Put the given paper into the database.
        post_paper(paperId, paperTitle, paperAbstract, paperYear, paperCitations)
        # Get all relevant information for each author: id, name
        authors = paper['authors']
        for author in authors:
            authorId = author['authorId']
            authorName = author['name']
            # Put the given author and writing relation into the database.
            post_author(authorId, authorName)
            post_write(paperId, authorId)
        # Get all references.
        # A reference is a paper that the current paper cites/uses.
        references = paper['references']
        for reference in references:
            referenceId = reference['paperId']
            referenceIsInfluential = reference['isInfluential']
            post_reference(paperId, referenceId, referenceIsInfluential)
        # Get all citations.
        # A citation is a paper that cites/uses the given paper.
        for citation in citations:
            citationId = citation['paperId']
            citationIsInfluential = citation['isInfluential']
            post_reference(citationId, paperId, citationIsInfluential)

def __init__(self, first_doi, n_gen=2, filename='filename'):
    self.first_doi = first_doi
    self.n_gen = n_gen
    self.paper = sch.paper(first_doi)  # semantic scholar api
    self.plot_name = self.paper['title']
    self.filename = filename
    self.done_dois = []
    self.redo_dois = []
    self.G = nx.MultiGraph()  # article titles graph
    self.A = nx.MultiGraph()  # authors graph
    self.generation = 0
    # semantic scholar limits 100 retrievals per 5 min; sleep for 5 min when counter reaches 99
    self.retrived_count = 0
    self.first_authors_ids = []
    self.max_degree = 0
    self.df = pd.DataFrame()
    self.first()
    for i in range(n_gen):
        self.create_next_generation()
    self.color_by_self_citation()
    self.plot_html(filename=filename)
    self.export_csv(filename=filename)

def main():
    my_author_ids = ['2511586', '3766986', '69013210']  # for whatever reason I have three ids
    my_paper_ids = []
    for _id in my_author_ids:
        loop(_id, my_paper_ids)
    for _paper in my_paper_ids:
        this_paper = sch.paper(_paper, timeout=2)
        m_list = ''
        for author in this_paper['authors']:
            m_list = m_list + smart_str(author['name']) + ', '
        my_paper = (m_list + ' ' + smart_str(this_paper['year']) + ' '
                    + smart_str(this_paper['title']))
        cites = this_paper['citations']
        for cite in cites:
            a_list = ''
            for author in cite['authors']:
                a_list = a_list + smart_str(author['name']) + ', '
            if 'hubal' not in a_list.lower():
                this_cite = (a_list + ' ' + smart_str(cite['year']) + ' '
                             + smart_str(cite['title']))
                write_file(my_paper, this_cite)

def test_not_found(self):
    data = sch.paper(0, timeout=5)
    self.assertEqual(len(data), 0)

def create_next_generation(self):
    if self.generation == 0:
        node_list = [node for node in list(self.G)]
        self.done_dois.append(self.first_doi)
    else:
        node_list = [
            node for node in list(self.G)
            if ((node not in self.done_dois) and len(node) > 1)
        ]  # len(node) > 1 skips empty/placeholder node ids
        self.done_dois += node_list + self.redo_dois
    for node in node_list:
        for doi_ in self.G.nodes[node]['citations_dois']:
            # semantic scholar limits ~100 retrievals per 5 min
            if self.retrived_count > 99:
                time.sleep(300)
                self.retrived_count = 0
            try:
                paper = sch.paper(doi_, timeout=350)
            except KeyError:
                pass
            try:
                title_ = paper['title']
            except KeyError:
                title_ = 'none'
            try:
                journal_ = paper['venue']
            except KeyError:
                journal_ = 'none'
            try:
                abstract_ = paper['abstract']
            except KeyError:
                abstract_ = 'none'
            try:
                year_ = paper['year']
            except KeyError:
                year_ = 'none'
            try:
                authors_ = []
                author_ids = []
                for auth in paper['authors']:
                    authors_.append(auth['name'])
                    author_ids.append(auth['authorId'])
                # remove None entries
                authors_ = list(filter(None, authors_))
                author_ids = list(filter(None, author_ids))
                all_combinations = list(itertools.combinations(author_ids, 2))
                # create all author nodes in the authors' graph
                for comb in all_combinations:
                    self.A.add_edge(comb[0], comb[1])
                for auth in paper['authors']:
                    # add name and authorId attributes to the author nodes
                    self.A.nodes[auth['authorId']]['name'] = auth['name']
                    self.A.nodes[auth['authorId']]['authorId'] = auth['authorId']
            except KeyError:
                authors_ = 'none'
                author_ids = []
            citations_dois_ = []
            try:
                if len(paper['citations']) > 0:  # check that the paper was cited
                    for dic in paper['citations']:
                        if dic['doi'] is not None and len(dic['doi']):
                            citations_dois_.append(dic['doi'])
                    citations_dois_ = list(filter(None, citations_dois_))  # remove None
                    if len(citations_dois_) > self.max_degree:
                        self.max_degree = len(citations_dois_)
            except KeyError:
                pass
            self.G.add_edge(node, doi_)
            self.G.nodes[doi_]['title'] = title_
            self.G.nodes[doi_]['doi'] = doi_
            self.G.nodes[doi_]['journal'] = journal_
            self.G.nodes[doi_]['year'] = year_
            self.G.nodes[doi_]['authors'] = authors_
            self.G.nodes[doi_]['citations_dois'] = citations_dois_
            self.G.nodes[doi_]['abstract'] = abstract_
            self.G.nodes[doi_]['authorIds'] = author_ids
            self.G.nodes[doi_]['size'] = 2.5
            if title_ == 'none' and journal_ == 'none' and year_ == 'none':
                self.redo_dois.append(doi_)
            self.retrived_count += 1
        # save temporary file
        nx.write_gpickle(
            self.G, 'temp_generation_' + str(self.generation) + '.gpickle')
    self.generation += 1
    print('generation:', self.generation)
    print(len(list(self.G)), 'papers were found in the next generation')

"<key id=\"d7\" for=\"node\" attr.name=\"isInfluential\" attr.type=\"boolean\"> <default>False</default> </key>\n" ) testwritefile.write( "<key id=\"d8\" for=\"node\" attr.name=\"referencedArticle\" attr.type=\"string\"> <default></default> </key>\n" ) testwritefile.write( "<key id=\"d9\" for=\"edge\" attr.name=\"aposRetratacao\" attr.type=\"boolean\"> <default>False</default> </key>\n" ) testwritefile.write( "<key id=\"d10\" for=\"edge\" attr.name=\"influential\" attr.type=\"boolean\"> <default>False</default> </key>\n" ) #para cada artigoretratado for artigo in retratados: #consumindo servico do semantic scholar paper = sch.paper(artigo['doi'] + "?include_unknown_references=true", timeout=8) #caso nao encontre o artigo na base, pula pra proxima iteracao if not paper: j = j + 1 continue paper.keys() article_url = doi_api + artigo['doi'] r = requests.get(url=article_url) response = r.json() publication_date = response["values"][0]["timestamp"].split('T')[0] title = escape(paper['title']).replace( '"', '') #titulo do artigo retratado ja com escape testwritefile.write( "<node id=\"" + artigo['doi'] +
import itertools

import torch
import torch.nn.functional as F
from pprint import pprint
from transformers import AutoModel, AutoTokenizer

gpu = torch.device('cuda')

# sample papers
paper_ids = [
    'df2b0e26d0599ce3e70df8a9da02e51594e0e992',  # "BERT"
    '6b85b63579a916f705a8e10a49bd8d849d91b1fc',  # "GPT-3"
    '077f8329a7b6fa3b7c877a57b81eb6c18b5f87de',  # "RoBERTa"
    '8e787e925eeb7ad735a228b2b1e8dd6d9620be83',  # a clinical paper
]

# get paper title and abstract
papers = [sch.paper(p_id, timeout=10) for p_id in paper_ids]
paper_names = ["BERT", "GPT-3", "RoBERTa", "Clinical Course"]
paper_texts = [p['title'] + ' ' + p['abstract'] for p in papers]

# load model and tokenizer
model = AutoModel.from_pretrained("allenai/specter").to(device=gpu)
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")

# preprocess raw text
inputs = [
    tokenizer(text, return_tensors='pt', truncation=True,
              max_length=512).to(device=gpu)
    for text in paper_texts
]

# get embeddings (assumed completion: take the [CLS] token of the last hidden
# state, the usual allenai/specter convention)
embeddings = [model(**inp).last_hidden_state[:, 0, :] for inp in inputs]

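# A possible follow-up, sketched here because itertools and F are imported in
# the snippet above but not otherwise used: compare the papers pairwise by
# cosine similarity of their SPECTER embeddings.
for (i, a), (j, b) in itertools.combinations(enumerate(embeddings), 2):
    sim = F.cosine_similarity(a, b).item()
    print("{} vs {}: {:.3f}".format(paper_names[i], paper_names[j], sim))
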
def test_paper():
    data = sch.paper('10.1093/mind/lix.236.433')
    assert data['title'] == 'Computing Machinery and Intelligence'

def test_not_found():
    data = sch.paper(0)
    assert len(data) == 0

testwritefile.write(
    "@prefix prism: <http://prismstandard.org/namespaces/basic/2.0/> . \n"
)
j = 0  # counter of articles not found on Semantic Scholar
i = 1
for artigos in retratados:
    artigo = retratados[artigos]
    if i == 99:
        time.sleep(300)
    paper = sch.paper(artigo['doi'], timeout=8)
    # if the article is not found in the database, skip to the next iteration
    if not paper:
        j = j + 1
        print("Article DOI: " + artigo['doi'] +
              " not found on Semantic Scholar")
        continue
    # get the publication date
    article_url = doi_api + str(artigo['doi'])
    r = requests.get(url=article_url)
    response = r.json()
    publication_date = response["values"][0]["timestamp"].split('T')[0]

file = open(auth_name_file, 'w', encoding='utf8')
print("Analysis in progress...")
for paper in author['papers']:
    paper_year = paper['year']
    if paper_year is not None:
        paper_title = str(paper['title']).replace(',', '')
        count += 1
        CSVstring = (str(count) + "," + name + "," + str(paper_year) + ","
                     + paper_title)
        file.write(CSVstring)
        full_paper = sch.paper(paper['paperId'])
        team = "team_" + str(count)
        for coauthor in full_paper['authors']:
            if str(coauthor['authorId']) != iD_auth:
                file.write("\n")
                CSVstring = (str(count) + "," + str(coauthor['name']) + ","
                             + str(paper_year) + "," + paper_title + "," + team)
                file.write(CSVstring)
        print("Analyzing document number: " + str(count))
        if count > numbers_of_papers - 1:
            print("Analysis complete")
            break
        file.write("\n")
    else: