Example #1
def semantic_scholar_alias(NAME):
    """
    inputs a URL that's full of publication orientated links, preferably the
    authors scholar page.
    """

    aliases = None
    dois, coauthors, titles, visit_urls = author_to_urls(NAME)
    for d in dois:
        paper = sch.paper(d, timeout=32)
        if "authors" in paper.keys():
            all_coauthors = paper["authors"]
            for co_name in all_coauthors:
                key = co_name["name"]
                if (NAME.split(" ")[0] in key.split(" ")[0]
                        or key.split(" ")[0] in NAME.split(" ")[0]
                        or NAME.split(" ")[-1] in key.split(" ")[-1]):
                    author = sch.author(co_name["authorId"], timeout=32)

                    if "aliases" in author.keys():
                        aliases = author["aliases"]
                        return aliases
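A minimal usage sketch, assuming the module-level client used throughout these examples (the old `semanticscholar` package, as imported directly in Example #3) and that `author_to_urls` is defined; the author name is a placeholder:

import semanticscholar as sch

# Hypothetical call; "J. Smith" is a placeholder author name.
aliases = semantic_scholar_alias("J. Smith")
if aliases:
    print("Known aliases:", ", ".join(aliases))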
Example #2
def get_info(item, soup):
    root = 'https://www.semanticscholar.org/paper/'
    paper_data = soup.find_all(class_='search-result-title')
    if len(paper_data) == 0:
        item['cite_num'] = ''
        item['abstract'] = ''
        return

    first_paper = paper_data[0]
    url = root + first_paper['href']

    soup = get_soup(url)
    if soup is None:
        item['cite_num'] = ''
        item['abstract'] = ''
        return

    doi_data = soup.find_all(class_='doi__link')
    if len(doi_data) == 0:
        item['cite_num'] = ''
        item['abstract'] = ''
        return

    doi = doi_data[0].text
    paper = sch.paper(doi)
    citation = len(paper['citations'])
    abstract = paper['abstract']  # the abstract text itself
    item['cite_num'] = citation
    item['abstract'] = abstract
Example #3
    def fetch_from_id(self, paper_id: PaperId) -> Optional[PaperAndRefs]:
        """Returns an entry a"""
        if paper_id in self.memcache:
            return self.memcache[paper_id]

        found = self.__paper_from_db_wrapper(paper_id, True)
        if found:
            return found

        try:
            paper_dict: Dict = semanticscholar.paper(paper_id)
            error = len(paper_dict.keys()) == 0
        except requests.exceptions.RequestException as e:
            print(f"[ERROR] {str(e)}")
            error = True

        if error:
            result = None
        else:
            result = self.__update_db(response=paper_dict)
            result.id = paper_dict["paperId"]

        self.memcache[paper_id] = result
        return result
Example #4
def query_semantic_scholar_by_id(paper_id):
    paper = semanticscholar.paper(paper_id, timeout=10)
    print(paper.keys())

    print(paper['title'])
    for author in paper['authors']:
        print(author['name'])
    return paper
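As the surrounding examples show, `paper()` accepts several identifier formats: a Semantic Scholar paper hash (Example #18), a DOI (Example #6), or an `arXiv:`-prefixed arXiv ID (Example #8). A short sketch, with IDs for illustration:

# Each call returns the same kind of paper dict.
paper_by_hash = sch.paper('df2b0e26d0599ce3e70df8a9da02e51594e0e992')  # S2 hash ("BERT")
paper_by_doi = sch.paper('10.1093/mind/lix.236.433')                   # DOI
paper_by_arxiv = sch.paper('arXiv:1810.04805')                         # arXiv ID (assumed example)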
Example #5
def fetch_semantic_papers(author):
    '''Queries Semantic Scholar to download info for all papers
    named in a given author object.'''
    filled_papers = []
    for stub in author['papers']:
        paper = sch.paper(stub['paperId'], timeout=2)
        filled_papers.append(paper)
    return filled_papers
Example #6
    def test_paper(self):
        data = sch.paper('10.1093/mind/lix.236.433', timeout=5)
        self.assertEqual(data['title'],
                         'Computing Machinery and Intelligence')

        self.assertRaises(Timeout,
                          sch.paper,
                          '10.1093/mind/lix.236.433',
                          timeout=0.01)
Example #7
def get_semantic_info(item):
    doi = item['doi']
    paper = sch.paper(doi)
    if not paper:  # paper() returns an empty dict when the DOI is not found
        return item

    citation = len(paper['citations'])
    abstract = break_line(paper['abstract'])
    item['cite_num'] = citation
    item['abstract'] = abstract

    return item
Example #8
def fetch_paper_information(arxiv_id):
    print("Fetching: {}".format(arxiv_id))
    time.sleep(2.5)
    paper = sch.paper("arXiv:{}".format(arxiv_id),
                      timeout=10,
                      include_unknown_references=True)
    try:
        return paper, [
            p["arxivId"] for p in paper["references"]
            if p["arxivId"] is not None
        ]
    except KeyError:
        return paper, []
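A minimal sketch of crawling one level of the arXiv citation graph with this helper; the seed arXiv ID is a placeholder:

# Hypothetical seed paper; fetch it, then fetch each arXiv-resolvable reference.
seed_paper, ref_ids = fetch_paper_information('1810.04805')
for ref_id in ref_ids:
    ref_paper, _ = fetch_paper_information(ref_id)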
Example #9
def paperid(arxiv_id):
    trial_num = 0
    while trial_num < sch_max_trial:
        paper = sch.paper(arxiv_id, timeout=2)
        if paper == {}:
            print(str(trial_num + 1) + 'th sch parse error for ' + arxiv_id)
        else:
            return paper['paperId']
        trial_num += 1
        if trial_num < sch_max_trial:
            print('sleep and retry for ' + arxiv_id)
            time.sleep(sch_call_sleep)
        else:
            raise Exception('fatal parse error for ' + arxiv_id)
Example #10
def papercollector(author_id, year):
    author = sch.author(author_id)
    papers = author["papers"]
    collectedpapers = []
    for p in papers:
        if p["year"] == year:
            a = sch.paper(p["paperId"])["abstract"]
            try:
                lan = detect(a)
                if lan == 'en':
                    p["abstract"] = a
                    collectedpapers.append(p)
            except TypeError:  # detect() raises TypeError when the abstract is None
                collectedpapers.append(p)
    print(collectedpapers)
    return collectedpapers
Example #11
def visit_semantic_scholar_abstracts(NAME, tns, more_links):
    """
    inputs a URL that's full of publication orientated links, preferably the
    authors scholar page.
    """

    author_results = []
    aliases = None
    dois, coauthors, titles, visit_urls = author_to_urls(NAME)

    for d in tqdm(dois, desc="visiting abstracts"):
        paper = sch.paper(d, timeout=8)

        urlDat = {}
        if "citationVelocity" in paper.keys():
            urlDat["citationVelocity"] = paper["citationVelocity"]
        if "fieldsOfStudy" in paper.keys():
            urlDat["fieldsOfStudy"] = str(paper["fieldsOfStudy"])
        if "numCitedBy" in paper.keys():
            urlDat["numCitedBy"] = paper["numCitedBy"]
        # urlDat["influentialCitationCount"] = paper["influentialCitationCount"]
        urlDat["semantic"] = True

        if "url" in paper.keys():
            urlDat["link"] = paper["title"]
        if aliases is None:
            if "aliases" in paper.keys():
                urlDat["aliases"] = paper["aliases"]
            else:
                pass
        if "abstract" in paper.keys():
            urlDat = text_proc(str(paper["abstract"]), urlDat)
            author_results.append(urlDat)
    author_results = [
        urlDat for urlDat in author_results if urlDat is not None
    ]

    return author_results, visit_urls
Example #12
def scrape():  # pylint:disable=too-many-locals
    """Scrape research papers from the Semantic Scholar API"""
    # Determine the ids of all relevant research papers.
    papers = get_all_papers()
    for paperId in papers:
        # Get all relevant information for the paper: id, title, abstract, year
        paper = ss.paper(paperId)
        paperTitle = paper['title']
        paperAbstract = paper['abstract']
        paperYear = paper['year']
        citations = paper['citations']
        paperCitations = len(citations)
        # Put the given paper into the database.
        post_paper(paperId, paperTitle, paperAbstract, paperYear,
                   paperCitations)

        # Get all relevant information for the author: id, name
        authors = paper['authors']
        for author in authors:
            authorId = author['authorId']
            authorName = author['name']
            # Put the given author and writing relation into the database.
            post_author(authorId, authorName)
            post_write(paperId, authorId)

        # Get all references.
        # A reference is a paper that the current paper cites/uses.
        references = paper['references']
        for reference in references:
            referenceId = reference['paperId']
            referenceIsInfluential = reference['isInfluential']
            post_reference(paperId, referenceId, referenceIsInfluential)

        # Get all citations.
        # A citation is a paper that cites/uses the given paper.
        for citation in citations:
            citationId = citation['paperId']
            citationIsInfluential = citation['isInfluential']
            post_reference(citationId, paperId, citationIsInfluential)
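The `post_*` helpers are not shown. A minimal sketch of what `post_paper` might look like, assuming a SQLite backend; the database file, table, and column names are hypothetical:

import sqlite3

conn = sqlite3.connect('papers.db')  # hypothetical database file

def post_paper(paper_id, title, abstract, year, citation_count):
    # INSERT OR IGNORE keeps the scrape idempotent across reruns;
    # assumes a `papers` table created elsewhere.
    conn.execute(
        'INSERT OR IGNORE INTO papers (id, title, abstract, year, citations) '
        'VALUES (?, ?, ?, ?, ?)',
        (paper_id, title, abstract, year, citation_count))
    conn.commit()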
Example #13
    def __init__(self, first_doi, n_gen=2, filename='filename'):
        self.first_doi = first_doi
        self.n_gen = n_gen
        self.paper = sch.paper(first_doi)  # semantic scholar api
        self.plot_name = self.paper['title']
        self.filename = filename
        self.done_dois = []
        self.redo_dois = []
        self.G = nx.MultiGraph()  # article titles graph
        self.A = nx.MultiGraph()  # authors graph
        self.generation = 0
        self.retrived_count = 0  # semantic scholar limits 100 retrievals per 5 min. sleep for 5 min when counter=99
        self.first_authors_ids = []
        self.max_degree = 0
        self.df = pd.DataFrame()

        self.first()
        for i in range(n_gen):
            self.create_next_generation()
        self.color_by_self_citation()
        self.plot_html(filename=filename)
        self.export_csv(filename=filename)
Example #14
def main():
    my_author_ids = ['2511586', '3766986',
                     '69013210']  # for whatever reason I have three ids
    my_paper_ids = []
    for _id in my_author_ids:
        loop(_id, my_paper_ids)
    for _paper in my_paper_ids:
        this_paper = sch.paper(_paper, timeout=2)
        m_list = ''
        for author in this_paper['authors']:
            m_list = m_list + smart_str(author['name']) + ', '
        my_paper = m_list + ' ' + \
            smart_str(this_paper['year']) + ' ' + \
            smart_str(this_paper['title'])
        cites = this_paper['citations']
        for cite in cites:
            a_list = ''
            for author in cite['authors']:
                a_list = a_list + smart_str(author['name']) + ', '
            if 'hubal' not in a_list.lower():
                this_cite = a_list + ' ' + \
                            smart_str(cite['year']) + ' ' + \
                            smart_str(cite['title'])
                write_file(my_paper, this_cite)
Example #15
    def test_not_found(self):
        data = sch.paper(0, timeout=5)
        self.assertEqual(len(data), 0)
Example #16
    def create_next_generation(self):

        if self.generation == 0:
            node_list = list(self.G)
            self.done_dois.append(self.first_doi)
        else:
            node_list = [
                node for node in list(self.G)
                if ((node not in self.done_dois) and len(node) > 1)
            ]  # len(node) > 1 skips malformed single-character nodes
            self.done_dois += node_list + self.redo_dois
        for node in node_list:
            for doi_ in self.G.nodes[node]['citations_dois']:
                if self.retrived_count > 99:
                    time.sleep(300)
                    self.retrived_count = 0
                try:
                    paper = sch.paper(doi_, timeout=350)
                except KeyError:
                    continue  # skip this DOI; `paper` would otherwise be stale or undefined
                try:
                    title_ = paper['title']
                except KeyError:
                    title_ = 'none'
                try:
                    journal_ = paper['venue']
                except KeyError:
                    journal_ = 'none'
                try:
                    abstract_ = paper['abstract']
                except KeyError:
                    abstract_ = 'none'
                try:
                    year_ = paper['year']
                except KeyError:
                    year_ = 'none'
                try:
                    authors_ = []
                    author_ids = []
                    for auth in paper['authors']:
                        authors_.append(auth['name'])
                        author_ids.append(auth['authorId'])
                    # remove None entries once, after the loop
                    authors_ = list(filter(None, authors_))
                    author_ids = list(filter(None, author_ids))
                    all_combinations = list(
                        itertools.combinations(author_ids, 2))
                    # create all author nodes in authors' graph
                    for comb in all_combinations:
                        self.A.add_edge(comb[0], comb[1])
                    for auth in paper['authors']:
                        # add name attribute to nodes
                        self.A.nodes[auth['authorId']]['name'] = auth['name']
                        # add authorId attribute
                        self.A.nodes[
                            auth['authorId']]['authorId'] = auth['authorId']
                except KeyError:
                    authors_ = 'none'
                    author_ids = []
                citations_dois_ = []

                try:
                    if len(paper['citations']) > 0:  # check paper was cited
                        for dic in paper['citations']:
                            if dic['doi'] is not None and len(dic['doi']):
                                citations_dois_.append(dic['doi'])
                        citations_dois_ = list(filter(
                            None, citations_dois_))  # remove None
                        if len(citations_dois_) > self.max_degree:
                            self.max_degree = len(citations_dois_)
                except KeyError:
                    pass

                self.G.add_edge(node, doi_)
                self.G.nodes[doi_]['title'] = title_
                self.G.nodes[doi_]['doi'] = doi_
                self.G.nodes[doi_]['journal'] = journal_
                self.G.nodes[doi_]['year'] = year_
                self.G.nodes[doi_]['authors'] = authors_
                self.G.nodes[doi_]['citations_dois'] = citations_dois_
                self.G.nodes[doi_]['abstract'] = abstract_
                self.G.nodes[doi_]['authorIds'] = author_ids
                self.G.nodes[doi_]['size'] = 2.5
                if title_ == 'none' and journal_ == 'none' and year_ == 'none':
                    self.redo_dois.append(doi_)
                self.retrived_count += 1
        # save temporary file
        nx.write_gpickle(
            self.G, 'temp_generation_' + str(self.generation) + '.gpickle')
        self.generation += 1
        print('generation:', self.generation)
        print(len(list(self.G)), 'papers were found in the next generation')
Example #17
        "<key id=\"d7\" for=\"node\" attr.name=\"isInfluential\" attr.type=\"boolean\"> <default>False</default> </key>\n"
    )
    testwritefile.write(
        "<key id=\"d8\" for=\"node\" attr.name=\"referencedArticle\" attr.type=\"string\"> <default></default> </key>\n"
    )
    testwritefile.write(
        "<key id=\"d9\" for=\"edge\" attr.name=\"aposRetratacao\" attr.type=\"boolean\"> <default>False</default> </key>\n"
    )
    testwritefile.write(
        "<key id=\"d10\" for=\"edge\" attr.name=\"influential\" attr.type=\"boolean\"> <default>False</default> </key>\n"
    )

    # for each retracted article
    for artigo in retratados:
        # query the Semantic Scholar service
        paper = sch.paper(artigo['doi'] + "?include_unknown_references=true",
                          timeout=8)

        # if the article is not found, skip to the next iteration
        if not paper:
            j = j + 1
            continue

        article_url = doi_api + artigo['doi']
        r = requests.get(url=article_url)
        response = r.json()
        publication_date = response["values"][0]["timestamp"].split('T')[0]
        title = escape(paper['title']).replace(
            '"', '')  # title of the retracted article, already escaped
        testwritefile.write(
            "<node id=\"" + artigo['doi'] +
Example #18
import itertools
import torch
import torch.nn.functional as F
from pprint import pprint
from transformers import AutoModel, AutoTokenizer
import semanticscholar as sch

gpu = torch.device('cuda')

# sample papers
paper_ids = [
    'df2b0e26d0599ce3e70df8a9da02e51594e0e992',  # "BERT"
    '6b85b63579a916f705a8e10a49bd8d849d91b1fc',  # "GPT-3"
    '077f8329a7b6fa3b7c877a57b81eb6c18b5f87de',  # "RoBERTa"
    '8e787e925eeb7ad735a228b2b1e8dd6d9620be83',  # a clinical paper
]

# get paper title and abstract
papers = [sch.paper(p_id, timeout=10) for p_id in paper_ids]
paper_names = ["BERT", "GPT-3", "RoBERTa", "Clinical Course"]
paper_texts = [p['title'] + ' ' + p['abstract'] for p in papers]

# load model and tokenizer
model = AutoModel.from_pretrained("allenai/specter").to(device=gpu)
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")

# preprocess raw text
inputs = [
    tokenizer(text, return_tensors='pt', truncation=True,
              max_length=512).to(device=gpu) for text in paper_texts
]

# get embeddings: a typical SPECTER pattern (assumed completion) takes the
# [CLS] token of the last hidden state as the document embedding
embeddings = [model(**inp).last_hidden_state[:, 0, :] for inp in inputs]
Example #19
def test_paper():
    data = sch.paper('10.1093/mind/lix.236.433')
    assert data['title'] == 'Computing Machinery and Intelligence'
Example #20
def test_not_found():
    data = sch.paper(0)
    assert len(data) == 0
Example #21
        testwritefile.write(
            "@prefix prism: <http://prismstandard.org/namespaces/basic/2.0/> .  \n"
        )

        j = 0  # counter of articles not found on Semantic Scholar
        i = 1
        for artigos in retratados:

            artigo = retratados[artigos]

            if i == 99:
                time.sleep(300)

            paper = sch.paper(artigo['doi'], timeout=8)

            # if the article is not found, skip to the next iteration
            if not paper:
                j = j + 1
                print("Article DOI: " + artigo['doi'] +
                      " not found on Semantic Scholar")
                continue

            article_url = doi_api + str(
                artigo['doi'])  # get the publication date
            r = requests.get(url=article_url)
            response = r.json()
            publication_date = response["values"][0]["timestamp"].split('T')[
                0]  # publication date
Example #22
file = open(auth_name_file, 'w', encoding='utf8')

print("Analisi in corso...")
for paper in author['papers']:

    paper_year = paper['year']
    if paper_year is not None:
        paper_title = str(paper['title'])
        paper_title = paper_title.replace(',', '')

        count += 1
        CSVstring = str(count) + "," + name + "," + str(
            paper_year) + "," + paper_title
        file.write(CSVstring)

        paper = sch.paper(paper['paperId'])
        team = "team_" + str(count)
        for coauthor in paper['authors']:
            if str(coauthor['authorId']) != iD_auth:
                file.write("\n")
                CSVstring = str(count) + "," + str(coauthor['name']) + "," + str(
                    paper_year) + "," + paper_title + "," + team
                file.write(CSVstring)

        print("Analisi documento numero: " + str(count))
        if (count > numbers_of_papers - 1):
            print("Analisi completata")
            break

        file.write("\n")
    else: