Example #1
def get_papers_from_paper_citations(paper_title: str):
    """
    Gets the papers that cite the paper given as a parameter.
    Registers the found papers in the articles folder and the citation
    relationship in the citations folder.
    """
    target_paper_generator = scholarly.search_pubs(
        paper_title)  # search by title as a keyword

    print("=======> getting the target paper")
    target_paper = next(target_paper_generator)  # get the first result

    print('##########################')
    publications_generator = scholarly.citedby(target_paper)
    citations_count = 0
    try:
        while citations_count < NB_MAX_CITATIONS_PER_PAPERS:
            publication = next(publications_generator)
            # filled_publication = scholarly.fill(publication)
            mydict = publication_to_dict(publication)
            write_publication(mydict, PUBLICATIONS_CSV_FILE_OUTPUT)
            register_citation(
                target_paper['citedby_url'], mydict['citedby_url'])
            citations_count += 1
    except StopIteration:
        pass  # fewer citations than the cap; nothing more to fetch
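A minimal driver for the function above, assuming the module-level names it references (scholarly, NB_MAX_CITATIONS_PER_PAPERS, PUBLICATIONS_CSV_FILE_OUTPUT, publication_to_dict, write_publication, register_citation) are defined elsewhere; the title is just an illustration borrowed from Example #3:

if __name__ == '__main__':
    # hypothetical driver; the title is illustrative
    get_papers_from_paper_citations(
        'Machine-learned epidemiology: real-time detection of foodborne illness at scale')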
Example #2
def grab_related(cp, max_no=10):
    # Grab max_no papers, the most highly cited among the ones that
    # cite the initial paper, along with their information.
    cites = []
    count = 0
    for citation in tqdm(scholarly.citedby(cp), total=max_no):
        if count >= max_no:
            break
        cites.append(citation)
        count += 1

    return cites
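A usage sketch for grab_related; the imports match what the function assumes and the query string is illustrative:

from scholarly import scholarly
from tqdm import tqdm

# look up the initial paper, then collect the first five citing papers
cp = next(scholarly.search_pubs(
    'Machine-learned epidemiology: real-time detection of foodborne illness at scale'))
related = grab_related(cp, max_no=5)
print([c['bib']['title'] for c in related])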
Example #3
def test_search_pubs_citedby(self):
    """
    Testing that when we retrieve the list of publications that cite
    a publication, the number of citing publications is the same as
    the number of papers that are returned. We use a publication
    with a small number of citations, so that the test runs quickly.
    The 'Machine-learned epidemiology' paper had 11 citations as of
    June 1, 2020.
    """
    query = 'Machine-learned epidemiology: real-time detection of foodborne illness at scale'
    pubs = [p for p in scholarly.search_pubs(query)]
    self.assertGreaterEqual(len(pubs), 1)
    filled = scholarly.fill(pubs[0])
    cites = [c for c in scholarly.citedby(filled)]
    self.assertEqual(len(cites), filled['num_citations'])
Example #4
def download_citations():
    # Retrieve the author's data, fill it in, and print it
    # search_query = scholarly.search_author(NAME)
    search_query = scholarly.search_author_id(AUTHOR_ID)
    # author = scholarly.fill(next(search_query))
    author = scholarly.fill(search_query)
    print(author)

    # Print the titles of the author's publications
    print([pub['bib']['title'] for pub in author['publications']])

    # Take a closer look at the first publication
    # pub = scholarly.fill(author['publications'][1])
    # print(pub)
    independent_citations = []
    for pub in author['publications']:
        res_dict = {}
        time.sleep(random.randint(WAIT, WAIT * 2))
        pub = scholarly.fill(pub)
        res_dict["title"] = pub['bib']["title"]
        res_dict["year"] = pub['bib']["pub_year"]
        print(pub['bib']["title"])
        res_dict["author"] = [name.strip() for name in pub['bib']["author"].split("and")]
        time.sleep(random.randint(WAIT, WAIT * 2))
        # scholarly.citedby() returns a generator, which is always truthy,
        # so test the citation count instead of the generator itself
        if pub.get('num_citations', 0):
            cited_this = scholarly.citedby(pub)
            res_dict['cited_this'] = [
                {"author": citation['bib']["author"], "title": citation['bib']["title"]}
                for citation in cited_this
            ]
            indep_citations = print_citations(res_dict)
            res_dict['independent_citations'] = indep_citations
            independent_citations.append(
                {"title": res_dict["title"], "author": res_dict["author"],
                 'independent_citations': indep_citations})
            save_json(res_dict['title'], res_dict)
        else:
            break  # stop at the first publication with no citations

    save_json("independent_citations.json", independent_citations)
Example #5
def search_cited_papers(pub):
    # only consider the most prominent ten papers
    return [c for c, _ in zip(scholarly.citedby(pub), range(10))]
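The zip-with-range idiom works, but the same cap reads more directly with itertools.islice; an equivalent sketch:

from itertools import islice
from scholarly import scholarly

def search_cited_papers(pub):
    # take at most the first ten citing papers
    return list(islice(scholarly.citedby(pub), 10))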
Example #6
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
import os
env_path = Path('../') / '.env'
load_dotenv(dotenv_path=env_path)
SCRAPER = os.getenv("SCRAPER")

proxy_generator = ProxyGenerator()
proxy_generator.ScraperAPI(SCRAPER)
scholarly.set_timeout(60)
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')

author = scholarly.fill(next(search_query))

pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]

pubs2 = [[pub, list(scholarly.citedby(pub))] for pub in pubs
         if 'citedby_url' in pub]

print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
Example #7
all_pubs = []
all_cites = []
for paper_title in flame_pubs:
    results = scholarly.search_pubs(paper_title)
    pubs = [p for p in results]
    assert len(pubs) > 0  # Paper not found?
    print(f"Found '{paper_title}'.")

    # fill by querying site
    pub = scholarly.fill(pubs[0])
    all_pubs.append(pub)
    print(f"Details returned for  '{paper_title}'.")

    # get all publications that cite the current paper
    cites = [
        dict(c, **{'flame_paper': paper_title}) for c in scholarly.citedby(pub)
    ]
    all_cites.extend(cites)
    print(f"Found {len(cites)} citations for '{paper_title}'\n")

    # dump to file
    #f_pubs.write(yaml.dump([pubs]))
    #f_cites.write(yaml.dump([cites]))

# remove duplicates from citations list
unique_cites = []
for p in all_cites:
    if p not in unique_cites:
        unique_cites.append(p)

# remove cross refs to pubs
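The last step is only sketched in the comment above. One way to drop citing papers that are themselves among the tracked publications is to match on case-folded titles (a heuristic assumption, not the original author's code):

tracked_titles = {p['bib']['title'].casefold() for p in all_pubs}
unique_cites = [c for c in unique_cites
                if c['bib']['title'].casefold() not in tracked_titles]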
Example #8
#anvil.server.wait_forever()
busca_publicaciones(
    ['MADAIN PEREZ PATRICIO', 'Abiel Aguilar-González', 'Steven A Cholewiak'])

# Retrieve the author's data, fill in, and print
#print(author)

# Print the titles of the author's publications
#print([pub.bib['title'] for pub in author.publications])

# Take a closer look at the first publication
#pub = author.publications[0].fill()
#print(pub)

# Which papers cited that publication?
#print([citation.bib['title'] for citation in pub.citedby])
# Retrieve the author's data, fill in, and print
search_query = scholarly.search_author('Steven A Cholewiak')
author = scholarly.fill(next(search_query))
print(author)

# Print the titles of the author's publications
print([pub['bib']['title'] for pub in author['publications']])

# Take a closer look at the first publication
pub = scholarly.fill(author['publications'][0])
print(pub)

# Which papers cited that publication?
print([citation['bib']['title'] for citation in scholarly.citedby(pub)])
Example #9
from scholarly import scholarly, ProxyGenerator
import json

proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')

author = scholarly.fill(next(search_query))

pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]

pubs2 = []
for pub in pubs:
    if 'citedby_url' in pub:
        pubs2.append([pub, list(scholarly.citedby(pub))])

print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
Example #10
print("Searching on Google scholar")

author = scholarly.search_author_id('_7AMrKgAAAAJ')  # _7AMrKgAAAAJ is Quasar

quasar_stats = scholarly.fill(
    author, sections=['basics', 'indices', 'counts', 'publications'])

scholarly.pprint(quasar_stats)

# What papers cited our publications?
cit = []
for pub in quasar_stats['publications']:
    print(pub)
    cit.append(
        [citation for citation in scholarly.citedby(pub)]
    )  # limit the number of test runs because this will get blocked by Google quickly

print(
    f'There are currently {len(quasar_stats["publications"])} Quasar papers.')
for pub in quasar_stats['publications']:
    print(' ', pub['bib']['title'])

fcit = [item for sublist in cit for item in sublist]  # this is a flat list now
print(f'\nWe have {len(fcit)} citations so far for our Quasar papers.')

# I wonder if this can be done in fewer lines. :D
authors = [c["author_id"] for c in fcit]
citing_authors = [item for sublist in authors for item in sublist]
citing_authors = set([c for c in citing_authors
                      if c])  # citing authors with Google Scholar profile
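From here, each ID in citing_authors can be resolved to a profile with scholarly.search_author_id; a sketch (the sleep is an added precaution, not part of the original):

import time

for author_id in citing_authors:
    profile = scholarly.search_author_id(author_id)
    print(profile['name'])
    time.sleep(2)  # pace the requests so Google doesn't block us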
Example #11
# Retrieve the author's data, fill-in, and print
search_query = scholarly.search_author('Steven A Cholewiak')

author = next(search_query)
author_filled = scholarly.fill(author)

scholarly.pprint(author_filled)

# Take a closer look at the first publication
publication = author["publications"][0]
pub = scholarly.fill(publication)

# citations= pub.citedby()
citations_iterator = scholarly.citedby(pub)
# write up to the first 100 citing papers; open the file once so earlier
# entries are not overwritten, and stop cleanly if there are fewer than 100
with open('citedby.txt', 'w') as file:
    for i, citation in enumerate(citations_iterator):
        if i >= 100:
            break
        print(citation, file=file)

# print(pub)

# Which papers cited that publication?
# print([citation.bib['title'] for citation in pub.citedby])

# Free Proxy
# pg = ProxyGenerator()
# pg.FreeProxies()
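If Google starts blocking requests, the free-proxy setup sketched in the comments above can be enabled before any query is issued; note that FreeProxies rotates through public proxies and tends to be slow and unreliable:

from scholarly import scholarly, ProxyGenerator

pg = ProxyGenerator()
pg.FreeProxies()  # rotate through free public proxies
scholarly.use_proxy(pg)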
Example #12
import os

from scholarly import scholarly, ProxyGenerator
from scraper_api import ScraperAPIClient

SCRAPER = os.getenv("SCRAPER")  # ScraperAPI key, as in the .env-based example above


class ScraperAPI(ProxyGenerator):
    def __init__(self, api_key):
        assert api_key is not None  # validate before using the key

        self._api_key = api_key
        self._client = ScraperAPIClient(api_key)

        super(ScraperAPI, self).__init__()

        self._TIMEOUT = 120
        self._session = self._client
        self._session.proxies = {}

    def _new_session(self):
        self.got_403 = False
        return self._session

    def _close_session(self):
        pass  # no need to close the ScraperAPI client


pg = ScraperAPI(SCRAPER)
scholarly.use_proxy(pg)
scholarly.set_timeout(120)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pub = scholarly.fill(author['publications'][16])
print(pub)
print(list(scholarly.citedby(pub)))
Example #13
from scholarly import scholarly, ProxyGenerator
import json

proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')

author = scholarly.fill(next(search_query))

pubs = [scholarly.fill(pub) for pub in author['publications']
        if pub['num_citations'] > 0]

pubs2 = [[pub, list(scholarly.citedby(pub))] for pub in pubs
         if 'citedby_url' in pub]

print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))