Example #1
import json

from scholarly import scholarly


def fetch_citations(author,
                    filesave="citations.json",
                    proxy="",
                    proxy_list=""):
    """Fetch citations from Google Scholar using scholarly."""
    if proxy != "":
        print("Setting up proxy ", proxy)
        scholarly.use_proxy(scholarly.SingleProxy(http=proxy, https=proxy))
    if proxy_list != "":
        lproxies = open(proxy_list, 'r').readlines()

        def proxy_gen():
            if proxy_gen.counter >= len(lproxies):
                raise IndexError("We ran out of proxies...")
            proxy = lproxies[proxy_gen.counter]
            if not proxy.startswith("http"):
                proxy = "http://" + proxy
            proxy_gen.counter += 1
            return proxy

        proxy_gen.counter = 0
        scholarly.use_proxy(proxy_gen)

    print("Looking up " + author)
    search = scholarly.search_author(author)
    author = scholarly.fill(next(search))
    publications = []

    for i, pub in enumerate(author['publications']):
        cites = pub['num_citations']  # often this gets messed up upon .fill()
        if "pub_year" in pub['bib']:
            pubyear = pub['bib'][
                "pub_year"]  # also this gets messed up upon .fill()
            pub = scholarly.fill(pub)
            pub['bib']["pub_year"] = pubyear
        else:
            pub = scholarly.fill(pub)
            if not "pub_year" in pub.bib:
                # skip publications that really don't have a year,
                # they probably are crap that was picked up by the search robot
                continue

        pub['num_citations'] = cites
        print("Fetching: " + str(i) + "/" + str(len(author['publications'])) +
              ": " + pub['bib']["title"] + " (" + str(pub['bib']["pub_year"]) +
              ")")
        pub['bib'].pop("abstract", None)
        pub.pop("source", None)
        publications.append(pub)
    f = open(filesave, "w")
    f.write(json.dumps(publications))
    f.close()
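For illustration, a minimal way to call fetch_citations (the author name and the proxies.txt file here are hypothetical):

if __name__ == "__main__":
    # hypothetical call: look up an author while rotating through the
    # proxies listed in proxies.txt, and save the results to a JSON file
    fetch_citations("A. Einstein",
                    filesave="einstein_citations.json",
                    proxy_list="proxies.txt")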
Example #2
def get_new_proxy():
    proxy_works = False
    while not proxy_works:
        proxy = FreeProxy(country_id=["US"], rand=True, timeout=1).get()
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
    print("Found new proxy!")
    return proxy
Example #3
def set_new_proxy():
    while True:
        proxy = FreeProxy(rand=True, timeout=1).get()
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
        if proxy_works:
            break
    return proxy
Example #4
def set_new_proxy(text=True):
    """
    Reset the identity using FreeProxy
    Parameters
    ----------
    arg1 [OPTIONAL]| text: bool
        A boolean flag to return the IP address tuple (old, morphed)
    Returns
    -------
    Address
        fp.fp.FreeProxy
    """
    while True:
        # call the freeproxy object
        proxy = FreeProxy(rand=True, timeout=1).get()

        # allocate the proxy address to scholarly
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)

        # check if the IP address works
        if proxy_works:
            # exit the loop once a working proxy is found
            break

    # print the ip address depending on the text argument
    if text:
        # print the working ip
        print("Working proxy:", proxy)

    # return the proxy details
    return proxy
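A possible retry pattern around set_new_proxy (not from the snippet above; search_with_retries is a hypothetical helper and assumes scholarly is already imported):

def search_with_retries(name, attempts=3):
    # hypothetical helper: rotate to a fresh proxy and retry whenever a
    # lookup fails, e.g. because Google Scholar blocked the current IP
    for _ in range(attempts):
        try:
            return next(scholarly.search_author(name))
        except Exception:
            set_new_proxy(text=False)
    raise RuntimeError("all proxy attempts failed")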
Example #5
    def _set_new_proxy(self):
        while True:
            proxy = FreeProxy(rand=True, timeout=1).get()
            proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
            if proxy_works:
                break
        print("Working proxy:", proxy)
        return proxy
Example #6
def set_new_proxy():
    while True:
        proxy = FreeProxy().get()
        # use the freshly fetched proxy rather than a hard-coded address
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
        if proxy_works:
            break
    print("Working proxy:", proxy)
    return proxy
Example #7
    def proxy(self):
        proxy_works = scholarly.use_proxy(
            http="http://29ea0d9d66134811b51ead72601a1181:@proxy.crawlera.com:8010/"
        )
        print(proxy_works)

        test_query = scholarly.search_pubs(
            'Perception of physical stability and center of mass of 3D objects'
        )
        print(test_query)
Example #8
    def setUp(self):
        proxy_generator = ProxyGenerator()
        if "CONNECTION_METHOD" in scholarly.env:
            self.connection_method = os.getenv("CONNECTION_METHOD")
        else:
            self.connection_method = "none"
        if self.connection_method == "tor":
            tor_sock_port = None
            tor_control_port = None
            tor_password = "******"
            # Tor uses the 9050 port as the default socks port
            # on windows 9150 for socks and 9151 for control
            if sys.platform.startswith("linux") or sys.platform.startswith(
                    "darwin"):
                tor_sock_port = 9050
                tor_control_port = 9051
            elif sys.platform.startswith("win"):
                tor_sock_port = 9150
                tor_control_port = 9151
            proxy_generator.Tor_External(tor_sock_port, tor_control_port,
                                         tor_password)
            scholarly.use_proxy(proxy_generator)

        elif self.connection_method == "tor_internal":
            if sys.platform.startswith("linux"):
                tor_cmd = 'tor'
            elif sys.platform.startswith("win"):
                tor_cmd = 'tor.exe'
            proxy_generator.Tor_Internal(tor_cmd=tor_cmd)
            scholarly.use_proxy(proxy_generator)
        elif self.connection_method == "luminati":
            scholarly.set_retries(10)
            proxy_generator.Luminati(usr=os.getenv("USERNAME"),
                                     passwd=os.getenv("PASSWORD"),
                                     proxy_port=os.getenv("PORT"))
            scholarly.use_proxy(proxy_generator)
        elif self.connection_method == "freeproxy":
            proxy_generator.FreeProxies()
            scholarly.use_proxy(proxy_generator)
        else:
            scholarly.use_proxy(None)
Example #9
    def get_research_articles(self, max_num):
        # Search string for Google Scholar to look for.
        # e.g. "{self.title} {self.director.name}" would equate to "Concussion Peter Landesman" for the movie Concussion.
        search_str = f'{self.title} {self.director.name}'
        output = ""
        try:
            pg = ProxyGenerator()
            ip = os.environ['PROXY_IP']
            pg.SingleProxy(http=ip, https=ip)
            scholarly.use_proxy(pg)
            search_query = scholarly.search_pubs(search_str)
            for i in range(0, max_num):
                curr = next(search_query)

                # For debugging purposes, this is how you pretty print the search query's contents.
                #scholarly.pprint(curr)

                # Grab the title of the article.
                title = curr['bib']['title']

                # Begin our formatted html output for each found research article.
                output += f"""
                    <li>
                """

                # See if a publication url (i.e. curr['pub_url']) exists. If so, add an external link to it.
                if 'pub_url' in curr:
                    output += f"""
                        <a target='_blank' href=\"{curr['pub_url']}\">{title}</a>
                    """
                else:
                    output += f"""
                        {title}
                    """

                output += f"""
                    <br>
                """

                # Writes the abstract (i.e.curr['bib']['abstract']) if it exists.
                if 'bib' in curr and 'abstract' in curr['bib']:
                    output += f"""
                        <p>{curr['bib']['abstract']}</p>
                    """

                output += f"""
                </li>
                """
        except Exception as e:
            pass
            # Useful for seeing errors in your terminal. Replace pass with the print statement below.
            #print(sys.stderr, e)
        return output
Example #10
    def test_tor_launch_own_process(self):
        """
        Test that we can launch a Tor process
        """
        proxy_generator = ProxyGenerator()
        if sys.platform.startswith("linux"):
            tor_cmd = 'tor'
        elif sys.platform.startswith("win"):
            tor_cmd = 'tor.exe'

        tor_sock_port = random.randrange(9000, 9500)
        tor_control_port = random.randrange(9500, 9999)

        result = proxy_generator.Tor_Internal(tor_cmd, tor_sock_port, tor_control_port)
        self.assertTrue(result["proxy_works"])
        self.assertTrue(result["refresh_works"])
        self.assertEqual(result["tor_control_port"], tor_control_port)
        self.assertEqual(result["tor_sock_port"], tor_sock_port)
        # Check that we can issue a query as well
        query = 'Ipeirotis'
        scholarly.use_proxy(proxy_generator)
        authors = [a for a in scholarly.search_author(query)]
        self.assertGreaterEqual(len(authors), 1)
Example #11
    async def __call__(self):
        UserCancel = KeyboardInterrupt
        
        # region various embed types creation
        def publication_embeds(result) -> discord.Embed:
            embed = discord.Embed(
                title=result["bib"]["title"],
                description=result["bib"]["abstract"],
                url=result["eprint_url"]
                if "eprint_url" in result.keys()
                else result["pub_url"],
            )
            embed.add_field(
                name="Authors",
                value=", ".join(result["bib"]["author"]).strip(),
                inline=True,
            )

            embed.add_field(name="Publisher", value=result["bib"]["venue"], inline=True)
            embed.add_field(
                name="Publication Year", value=result["bib"]["pub_year"], inline=True
            )
            embed.add_field(
                name="Cited By",
                value=result["num_citations"]
                if "num_citations" in result.keys()
                else "0",
                inline=True,
            )

            embed.add_field(
                name="Related Articles",
                value=f'https://scholar.google.com{result["url_related_articles"]}',
                inline=True,
            )

            embed.set_footer(text=f"Requested by {self.ctx.author}")
            return embed

        def author_embeds(result) -> discord.Embed:
            embed = discord.Embed(title=result["name"])
            embed.add_field(
                name="Cited By", value=f"{result['citedby']} articles", inline=True
            )
            embed.add_field(name="Scholar ID", value=result["scholar_id"], inline=True)
            embed.add_field(
                name="Affiliation",
                value=result["affiliation"]
                if "affiliation" in result.keys()
                else "None",
                inline=True,
            )
            embed.add_field(
                name="Interests",
                value=f"{', '.join(result['interests']) if 'interests' in result.keys() else 'None'}",
                inline=True,
            )
            embed.set_image(url=result["url_picture"])
            embed.set_footer(text=f"Requested by {self.ctx.author}")
            return embed

        def citation_embeds(result) -> discord.Embed:
            embed = discord.Embed(
                title=result["bib"]["title"],
                description=f"```{scholarly.bibtex(result)}```",
                url=result["eprint_url"]
                if "eprint_url" in result.keys()
                else result["pub_url"],
            )
            embed.set_footer(text=f"Requested by {self.ctx.author}")
            return embed

        # endregion

        try:
            # region user flags processing

            pg = ProxyGenerator()
            proxy = FreeProxy(rand=True, timeout=1, country_id=["BR"]).get()
            pg.SingleProxy(http=proxy, https=proxy)
            scholarly.use_proxy(pg)

            # self.args processing
            if self.args is None:
                results = [next(scholarly.search_pubs(self.query)) for _ in range(5)]
                embeds = list(map(publication_embeds, results))
            elif "author" in self.args:
                results = [
                    next(scholarly.search_author(self.query)) for _ in range(5)
                ]
                embeds = list(map(author_embeds, results))
            elif "cite" in self.args:
                results = scholarly.search_pubs(self.query)
                results = [next(results) for _ in range(5)]
                embeds = list(map(citation_embeds, results))
            else:
                await self.message.edit(content="Invalid flag")
                return
            # endregion

            # sets the reactions for the search result
            if len(embeds) > 1:
                buttons = [[
                    {Button(style=ButtonStyle.grey, label="◀️", custom_id="◀️"): None},
                    {Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️"): None},
                    {Button(style=ButtonStyle.grey, label="▶️", custom_id="▶️"): None}
                ]]
            else:
                buttons = [[
                    Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️")
                ]]

            await Sudo.multi_page_system(self.bot, self.ctx, self.message, tuple(embeds), buttons)
            return

        except asyncio.TimeoutError:
            raise
        except (asyncio.CancelledError, discord.errors.NotFound):
            pass
        except scholarly_exceptions._navigator.MaxTriesExceededException:
            await self.message.edit(
                content="Google Scholar is currently blocking our requests. Please try again later"
            )
            Log.append_to_log(self.ctx, f"{self.ctx.command} error", "MaxTriesExceededException")
            return

        except Exception as e:
            await error_handler(self.bot, self.ctx, e, self.query)
        finally:
            return
Example #12
import sys
import time

from scholarly import scholarly

# Parse the author names
file_in = sys.argv[1]
authornames = []
with open(file_in, 'r') as f:
    for line in f:
        line = line.split('\n')[0]
        authornames.append(line)

# Indicate what data to get (see Author class in https://pypi.org/project/scholarly/)
sections = ['basics', 'indices']
max_homonyms = 5

#pip install free-proxy
from fp.fp import FreeProxy
proxy = FreeProxy(rand=True, timeout=1, country_id=['NO']).get()
scholarly.use_proxy(http=proxy, https=proxy)

# Loop through the authors
t0 = time.time()
data = list({})
for i, authname in enumerate(authornames):
    hindices = []
    emails, names, affiliations, citedbys = [], [], [], []
    try:
        search_query = scholarly.search_author(authname)
        for _ in range(max_homonyms):
            try:
                author = next(search_query)
                tmp_data = author.fill(sections=sections)
                hindices.append(tmp_data.hindex)
                emails.append(tmp_data.email)
Example #13
import sys
from scholarly import scholarly
import time
from tqdm import tqdm
import pickle as pkl

from stem import Signal
from stem.control import Controller
import requests

proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}
scholarly.use_proxy(**proxies)


def refresh_socket():
    print(requests.get('https://ident.me', proxies=proxies).text)
    with Controller.from_port(port=9051) as c:
        c.authenticate()
        c.signal(Signal.NEWNYM)
    print(requests.get('https://ident.me', proxies=proxies).text)


# RL - reinforcement learning
# CF - catastrophic forgetting
# STS - semantic textual similarity
# NLI - natural language inference (same as recognizing textual entailment)
# MC - machine comprehension
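A sketch of how refresh_socket could be combined with queries (not from the snippet above; lookup_authors is a hypothetical helper):

def lookup_authors(names):
    # hypothetical helper: request a new Tor identity before each lookup
    # so consecutive queries exit through different IP addresses
    results = []
    for name in tqdm(names):
        refresh_socket()
        results.append(next(scholarly.search_author(name)))
    return results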
Example #14
from scholarly import scholarly

# FLAME GPU publications
flame_pubs = [
    'High performance cellular level agent-based simulation with FLAME for the GPU',
    'FLAME: simulating large populations of agents on parallel hardware architectures',
    'A high performance agent based modelling framework on graphics card hardware with CUDA',
    'Template-driven agent-based modeling and simulation with CUDA',
    'Simulating heterogeneous behaviours in complex systems on GPUs',
    'FLAME GPU technical report and user guide (CS-11-03)',
    'Resolving conflicts between multiple competing agents in parallel simulations'
]

# Free proxies get blocked
#proxy_generator = ProxyGenerator()
#proxy_generator.FreeProxies()
scholarly.use_proxy(None)

# open file for dumping flame publication details
f_pubs = open("_data/publications.yml", "w")
f_cites = open("_data/citations.yml", "w")

all_pubs = []
all_cites = []
for paper_title in flame_pubs:
    results = scholarly.search_pubs(paper_title)
    pubs = [p for p in results]
    assert len(pubs) > 0  # Paper not found?
    print(f"Found '{paper_title}'.")

    # fill by querying site
    pub = scholarly.fill(pubs[0])
Example #15
def set_proxy():
    if ALLOW_PROXY_ON_SCHOLAR:
        pg = ProxyGenerator()
        pg.SingleProxy(http_proxy, https_proxy)
        scholarly.use_proxy(pg)
Example #16
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
import os
env_path = Path('../') / '.env'
load_dotenv(dotenv_path=env_path)
SCRAPER = os.getenv("SCRAPER")

proxy_generator = ProxyGenerator()
proxy_generator.ScraperAPI(SCRAPER)
scholarly.set_timeout(60)
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')

author = scholarly.fill(next(search_query))

pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]

pubs2 = [[pub, (list(scholarly.citedby(pub)))] for pub in pubs
         if 'citedby_url' in pub]

print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
Example #17
import functools
import operator
from collections import defaultdict

from scholarly import scholarly, ProxyGenerator
from tqdm import tqdm
from yattag import Doc, indent

# Settings
PEOPLE = [
    "James O'Shea", "Alex Saywell", "Philip Moriarty", "Peter Beton",
    "James Sharp"
]
OUTPUT_DIR = "D:/Nano Group Page/all_pubs"
MIN_YEAR = 1990

# Setup proxy to avoid ignored requests
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

# Preallocate
pubs_by_year = defaultdict(list)  # defaultdict creates entries if not already present, so we can append
pubs = []

# Get all publications in an unordered list
for p in PEOPLE:
    search_query = scholarly.search_author(f'{p}, Nottingham')
    author = next(search_query)
    info = scholarly.fill(author, sections=['publications'])
    pubs.append(info["publications"])
pubs = functools.reduce(operator.iconcat, pubs, [])

# For every publication
Example #18
import yaml
from scholarly import scholarly, ProxyGenerator

# something like: ssh -D 9050 -q -C -N <user>@<host>
# default values are shown below
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}
pg = ProxyGenerator()
pg.SingleProxy(**proxies)

scholarly.use_proxy(pg)

# Retrieve the author's data, fill-in, and print
#author=scholarly.search_author_id('4poYWhEAAAAJ')
search_query = scholarly.search_author('Vassil Vassilev')

while True:
    print("Iter")
    try:
        author = next(search_query).fill()
        if 'cern' in author.email: break
        #print(author)
    except StopIteration:
        break
#sys.exit(1)
print(author)

print("Titles")
Example #19
import numpy as np
import pandas as pd
from fp.fp import FreeProxy
from scholarly import scholarly


def scrape_scholar(query, pages=0, max_proxy_tries=5, log_path=''):
    '''
    Name: scrape_scholar
    Description: Searches Google Scholar for query and returns data for the results.
    Input:
    @query: search term
    @pages: number of pages (10 articles per page) to request; 0 means no limit
    @max_proxy_tries: maximum number of retries after a blocked request
    @log_path: optional suffix for the log file name
    Output: A pandas DataFrame with one paper per row
    '''

    generator = FreeProxy(rand=True)

    page_size = 10

    # create log file to write errors to
    log = open(f'{query}' + log_path + '.txt', 'w+')

    # initialize list which will contain all article data and be used for DataFrame
    rows = []

    # the number of the current result being pulled from google scholar
    index = 0

    results = str(1)
    
    num_tries = 0
    while num_tries<max_proxy_tries:
        # try-catch block that allows errors to be written in log file if they occur
        try:
            # proxy = generator.get()
            # print(proxy)

            # pg = ProxyGenerator()
            # pg.SingleProxy(http = "http://157.245.203.17:3128")
            scholarly.use_proxy(None)

            # creates a generator object for results for the query
            results = scholarly.search_pubs(query) #, start=0)

            # detects whether the limit has been passed, if there is one
            while not pages or index<page_size*pages:

                result = next(results)

                # retrieves the current result's bib entry
                curr_result = result.bib

                #instantiates current row container
                row = dict()

                # passes link to article
                row['Link'] = curr_result['url'] if 'url' in curr_result else np.nan

                # title of paper, removes quotes at the start and end if there
                row['Title'] = curr_result['title'] if 'title' in curr_result else np.nan

                # True if pdf is available, False otherwise
                # row['Accessible'] = bool(paper['repositoryDocument']['pdfStatus'])

                # page number paper would be on on the website assuming 10 papers per page
                row['Page number'] = index//page_size + 1

                # list of [initials last-name]
                row['Authors'] = curr_result['author'] if 'author' in curr_result else np.nan
                
                # checks published year
                row['Publish year'] = int(curr_result['year']) if 'year' in curr_result else np.nan

                # number of citations
                row['Citations'] = curr_result['cites'] if 'cites' in curr_result else np.nan

                # links to related articles
                row['Related articles'] = ('https://scholar.google.com/scholar?q=related:'
                                           + result['url_scholarbib'].split(':')[1]
                                           + ':scholar.google.com/&scioq=' + query
                                           + '&hl=en&as_sdt=0,14')

                # checks if publisher is available
                row['Publisher'] = curr_result['venue'] if 'venue' in curr_result else np.nan

                rows.append(row)
                index += 1
            # returns pandas DataFrame where each row is 1 paper
            return pd.DataFrame(rows)

        # write any errors to log file
        except Exception as e:
            # log.write(str(e))
            # print(str(e))
            # traceback.print_exc(file=sys.stdout)
            # log.write('\n')
            if rows:
                return pd.DataFrame(rows)
            if str(e) == "Cannot fetch the page from Google Scholar.":
                num_tries += 1
                continue
            else:
                return pd.DataFrame(rows)
    # returns partially filled DataFrame if failed
    return pd.DataFrame(rows)
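A hypothetical call of scrape_scholar (not part of the snippet above), assuming the imports added at the top:

# fetch roughly two pages (20 results) for a query and save them to CSV
df = scrape_scholar('agent based modelling', pages=2)
df.to_csv('scholar_results.csv', index=False)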