Exemplo n.º 1
0
    def test_ScraperAPI(self):
        """
        Test that searches work when routed through ScraperAPI.

        The API key is read from the ``SCRAPER_API_KEY`` environment variable.
        """
        proxy_generator = ProxyGenerator()
        proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY'))
        scholarly.set_timeout(60)
        # Register the generator with scholarly; without this call the search
        # below would not use the ScraperAPI proxy configured above.
        scholarly.use_proxy(proxy_generator)

        # Uses another method to test that proxy is working.
        self.test_search_keyword()
    def get_research_articles(self, max_num):
        """
        Build an HTML fragment listing Google Scholar results for this movie.

        The search string is "<title> <director name>", e.g.
        "Concussion Peter Landesman" for the movie Concussion. Requests are
        sent through the single proxy named by the ``PROXY_IP`` environment
        variable.

        Args:
            max_num: Maximum number of search results to render.

        Returns:
            A string of ``<li>`` elements, one per article found. The lookup
            is best-effort: if anything fails, whatever was rendered so far
            is returned (possibly an empty string).
        """
        # Function-scope imports keep this method self-contained.
        import html
        import itertools
        import logging

        search_str = f'{self.title} {self.director.name}'
        output = ""
        try:
            pg = ProxyGenerator()
            ip = os.environ['PROXY_IP']
            pg.SingleProxy(http=ip, https=ip)
            scholarly.use_proxy(pg)
            search_query = scholarly.search_pubs(search_str)
            # islice stops cleanly when fewer than max_num results exist.
            for curr in itertools.islice(search_query, max(max_num, 0)):
                # For debugging purposes, this is how you pretty print the search query's contents.
                #scholarly.pprint(curr)

                # Scraped text is untrusted: escape it before embedding it in HTML.
                title = html.escape(str(curr['bib']['title']))

                # Begin our formatted html output for each found research article.
                item = f"""
                    <li>
                """

                # See if a publication url (i.e. curr['pub_url']) exists. If so, add an external link to it.
                if 'pub_url' in curr:
                    pub_url = html.escape(str(curr['pub_url']), quote=True)
                    item += f"""
                        <a target='_blank' href=\"{pub_url}\">{title}</a>
                    """
                else:
                    item += f"""
                        {title}
                    """

                item += f"""
                    <br>
                """

                # Writes the abstract (i.e.curr['bib']['abstract']) if it exists.
                if 'abstract' in curr['bib']:
                    abstract = html.escape(str(curr['bib']['abstract']))
                    item += f"""
                        <p>{abstract}</p>
                    """

                item += f"""
                </li>
                """

                # Append only complete items so a failure part-way through
                # never leaves an unclosed <li> in the output.
                output += item
        except Exception:
            # Stay best-effort, but record the failure instead of hiding it.
            logging.getLogger(__name__).warning(
                "Google Scholar lookup failed for %r", search_str, exc_info=True)
        return output
Exemplo n.º 3
0
    def test_tor_launch_own_process(self):
        """
        Test that we can launch a Tor process
        """
        proxy_generator = ProxyGenerator()
        # Use 'tor.exe' on Windows and 'tor' everywhere else, so tor_cmd is
        # always bound (previously it was undefined on e.g. macOS).
        tor_cmd = 'tor.exe' if sys.platform.startswith("win") else 'tor'

        # Random ports from two disjoint ranges, so the socks and control
        # ports can never collide with each other.
        tor_sock_port = random.randrange(9000, 9500)
        tor_control_port = random.randrange(9500, 9999)

        result = proxy_generator.Tor_Internal(tor_cmd, tor_sock_port, tor_control_port)
        self.assertTrue(result["proxy_works"])
        self.assertTrue(result["refresh_works"])
        self.assertEqual(result["tor_control_port"], tor_control_port)
        self.assertEqual(result["tor_sock_port"], tor_sock_port)
        # Check that we can issue a query as well
        query = 'Ipeirotis'
        scholarly.use_proxy(proxy_generator)
        authors = list(scholarly.search_author(query))
        self.assertGreaterEqual(len(authors), 1)
Exemplo n.º 4
0
    def setUp(self):
        """
        Configure the scholarly proxy used by the tests.

        The method is selected by ``CONNECTION_METHOD``: "tor",
        "tor_internal", "luminati" or "freeproxy". Any other value, or no
        setting at all, disables the proxy.
        """
        proxy_generator = ProxyGenerator()
        if "CONNECTION_METHOD" in scholarly.env:
            self.connection_method = os.getenv("CONNECTION_METHOD")
        else:
            self.connection_method = "none"
        if self.connection_method == "tor":
            tor_sock_port = None
            tor_control_port = None
            tor_password = "******"
            # Tor uses the 9050 port as the default socks port
            # on windows 9150 for socks and 9151 for control
            if sys.platform.startswith(("linux", "darwin")):
                tor_sock_port = 9050
                tor_control_port = 9051
            elif sys.platform.startswith("win"):
                tor_sock_port = 9150
                tor_control_port = 9151
            proxy_generator.Tor_External(tor_sock_port, tor_control_port,
                                         tor_password)
            scholarly.use_proxy(proxy_generator)

        elif self.connection_method == "tor_internal":
            # Use 'tor.exe' on Windows and 'tor' everywhere else, so tor_cmd
            # is always bound (previously it was undefined on e.g. macOS).
            tor_cmd = 'tor.exe' if sys.platform.startswith("win") else 'tor'
            proxy_generator.Tor_Internal(tor_cmd=tor_cmd)
            scholarly.use_proxy(proxy_generator)
        elif self.connection_method == "luminati":
            scholarly.set_retries(10)
            proxy_generator.Luminati(usr=os.getenv("USERNAME"),
                                     passwd=os.getenv("PASSWORD"),
                                     proxy_port=os.getenv("PORT"))
            scholarly.use_proxy(proxy_generator)
        elif self.connection_method == "freeproxy":
            proxy_generator.FreeProxies()
            scholarly.use_proxy(proxy_generator)
        else:
            scholarly.use_proxy(None)
Exemplo n.º 5
0
        else:
            continue
    return file_list


def get_ids():
    """Print the article files in ARTICLES_INPUT_FOLDER and process only the first."""
    article_files = get_articles_files_list(ARTICLES_INPUT_FOLDER)
    print(article_files)
    # Only the first file is handled; a private sentinel marks "no files".
    nothing = object()
    first_file = next(iter(article_files), nothing)
    if first_file is not nothing:
        get_author_ids_for_file(first_file)


# Script entry: route scholarly through a Tor proxy, then process one
# hard-coded CSV file.
print("Started connection to tor !")

pg = ProxyGenerator()
pg.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(pg)

# NOTE: the return value of Tor_Internal is not checked, so this message is
# printed whether or not the proxy actually works.
print("Connection to tor done successfully !")
get_author_ids_for_file('articles3_copy.csv')

# get_ids()
"""
### the following section adds the missing columns to the publications csv files
"""


def add_columns_to_publications():
    files_list = get_articles_files_list(ARTICLES_INPUT_FOLDER)
    for file in files_list:
from futurewater.crossref import get_publication
from futurewater.util import format_author

# Retry limit; presumably consumed by code outside this excerpt — TODO confirm.
MAX_RETRIES_ON_ERROR = 3

# https://scholarly.readthedocs.io/en/latest/quickstart.html#installation
# https://github.com/scholarly-python-package/scholarly
# https://github.com/OpenAPC/openapc-de/blob/master/python/import_dois.py

# Only show WARNING and above from the HTTP libraries' loggers.
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# getLogger() with no name returns the root logger.
logger = logging.getLogger()

# Route all scholarly requests through Tor on socks port 9050 / control port
# 9051. This runs at import time.
# NOTE(review): the password below looks like a masked placeholder — confirm
# the real value is supplied before running.
pg = ProxyGenerator()
pg.Tor_External(tor_sock_port=9050,
                tor_control_port=9051,
                tor_password="******")
scholarly.use_proxy(pg)


def get_schoolar_data(author_name,
                      cache_folder="scholarly",
                      affiliation='UBC'):
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):
Exemplo n.º 7
0
import arxiv
import os
import glob
from googlesearch import search
import tarfile
from scholarly import scholarly, ProxyGenerator
from functools import lru_cache
import re
import time
import random

# Directory for downloaded papers, relative to the current working directory.
paper_download_dir = './papers'

# Route all scholarly requests through Tor, using the `tor` command.
# This runs at import time.
pg = ProxyGenerator()
pg.Tor_Internal(tor_cmd="tor")
scholarly.use_proxy(pg)


class color:
    """ANSI escape sequences for colouring and styling terminal text."""

    # Foreground colours.
    PURPLE = "\x1b[95m"
    CYAN = "\x1b[96m"
    DARKCYAN = "\x1b[36m"
    BLUE = "\x1b[94m"
    GREEN = "\x1b[92m"
    YELLOW = "\x1b[93m"
    RED = "\x1b[91m"
    # Text attributes.
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
    # Resets colours and attributes back to the terminal default.
    END = "\x1b[0m"

Exemplo n.º 8
0
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
import os
# Load settings from the .env file one directory above the working directory.
env_path = Path('../') / '.env'
load_dotenv(dotenv_path=env_path)
SCRAPER = os.getenv("SCRAPER")

# Route scholarly through ScraperAPI with a 60 second timeout.
proxy_generator = ProxyGenerator()
proxy_generator.ScraperAPI(SCRAPER)
scholarly.set_timeout(60)
scholarly.use_proxy(proxy_generator)

# Take the first author match and fill in its details.
search_query = scholarly.search_author('Maël Montévil')
first_match = next(search_query)
author = scholarly.fill(first_match)

# Fill in every publication that has at least one citation.
pubs = []
for pub in author['publications']:
    if pub['num_citations'] > 0:
        pubs.append(scholarly.fill(pub))

# Pair each publication that has a "cited by" link with its citing works.
pubs2 = []
for pub in pubs:
    if 'citedby_url' in pub:
        citing = list(scholarly.citedby(pub))
        pubs2.append([pub, citing])


def _not_serializable(_obj):
    """Fallback used by json.dumps for values it cannot encode."""
    return '<not serializable>'


print(json.dumps(pubs2, indent=2, default=_not_serializable))
Exemplo n.º 9
0
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
from fp.fp import FreeProxy

# Route scholarly through free proxies.
proxy_generator = ProxyGenerator()
proxy_generator.FreeProxies()
scholarly.use_proxy(proxy_generator)

# Take the first author match and fill in its details.
search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))

# Lazily select the cited publications, filling each one as it is selected.
cited = (pub for pub in author['publications'] if pub['num_citations'] > 0)
pubs = [scholarly.fill(pub) for pub in cited]

# Pair each publication that has a "cited by" link with its citing works.
pubs2 = []
for pub in pubs:
    if 'citedby_url' not in pub:
        continue
    pubs2.append([pub, list(scholarly.citedby(pub))])

serialized = json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>')
print(serialized)
Exemplo n.º 10
0
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path

# Route scholarly through a Tor proxy, using the `tor` command.
proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

# Take the first author match and fill in its details.
search_query = scholarly.search_author('Maël Montévil')

author = scholarly.fill(next(search_query))

# Fill in every publication that has at least one citation.
pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]

# Collect [publication, citing works] pairs as a flat list. The previous
# `pubs2 = [pubs2, [...]]` re-wrapped the accumulator on every iteration and
# produced an ever-deeper nested list instead.
pubs2 = []
for pub in pubs:
    if 'citedby_url' in pub:
        pubs2.append([pub, list(scholarly.citedby(pub))])

print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
Exemplo n.º 11
0
import re
import pandas
import numpy as np
import pandas as pd
import pickle as pkl
import random as rand
import time
from scholarly import scholarly

from fp.fp import FreeProxy
from scholarly import scholarly

from scholarly import ProxyGenerator
# NOTE: the proxy configuration below is commented out, so in the lines shown
# here `pg` is created but never configured or registered with scholarly.
pg = ProxyGenerator()

#proxy = FreeProxy(rand=True, timeout=10, country_id=['BR', 'US']).get()
#pg.SingleProxy(http=proxy, https=proxy)
#scholarly.use_proxy(pg)

# Boolean search expression; presumably submitted to Google Scholar by code
# outside this excerpt — TODO confirm.
query = """(automated OR automation) AND ("building code" OR compliance) AND (check OR validation) AND (AI OR "Machine Learning" OR NLP OR "Artificial Intelligence") AND (Building OR Construction OR "Town Planning")"""


# safely get data from a dictionary object
def safe_get_key(x: dict, key: str):
    """
    Return ``x[key]``, or ``None`` when the lookup fails.

    Args:
        x: Mapping (or other subscriptable object) to read from.
        key: Key to look up.

    Returns:
        The stored value, or ``None`` if the key is missing or ``x`` cannot
        be subscripted with ``key`` (for example when ``x`` is ``None``).
    """
    try:
        return x[key]
    except (LookupError, TypeError):
        # LookupError covers KeyError/IndexError; TypeError covers
        # non-subscriptable objects and unhashable keys. Anything else
        # (e.g. KeyboardInterrupt) propagates instead of being hidden.
        return None
Exemplo n.º 12
0
from scholarly import scholarly
from OSMPythonTools.nominatim import Nominatim
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from fp.fp import FreeProxy
from scholarly import ProxyGenerator
from time import sleep

# Route scholarly through one randomly chosen free proxy restricted to
# country code 'BR'. This runs at import time.
pg = ProxyGenerator()
# timeout=1 is passed to FreeProxy; presumably seconds — TODO confirm.
proxy = FreeProxy(rand=True, timeout=1, country_id=['BR']).get()
pg.SingleProxy(http=proxy, https=proxy)
scholarly.use_proxy(pg)


def plot_citations(author_name):
    m = Basemap(projection='mill', lon_0=180)
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')

    search_query = scholarly.search_author(author_name)
    author = next(search_query).fill()
    print(author)
    for pub in [author.publications[0]]:
        print('Title: ', pub.bib['title'])
        pub = pub.fill()
        sleep(45)
        for citation in pub.citedby:
            print(citation)
            sleep(45)
            firstAuthorId = None
            while firstAuthorId is None or len(citation.bib['author_id']) == 0:
Exemplo n.º 13
0
#!env python

from scholarly import scholarly
from scholarly import ProxyGenerator
import fileinput
import sys

# Route scholarly through Tor on socks port 9050 / control port 9051.
pg = ProxyGenerator()
pg.Tor_External(9050, 9051, 'password')
scholarly.use_proxy(pg)

# Each input line (from the files named on the command line, or stdin) is a
# publication title. One "title,gsrank,cites" row is printed per successful
# lookup.
for a in fileinput.input():
    # Lines keep their trailing newline, so a blank line is never "";
    # compare the stripped text instead.
    title = a.strip()
    if not title:
        continue

    try:
        search_query = scholarly.search_pubs(title)
        aa = next(search_query).fill()
        bib = aa.bib
        # Build the whole row first so a missing field cannot leave a
        # half-printed line behind.
        print(f"{a.rstrip()},{bib['gsrank']},{bib['cites']}")
    except Exception:
        # Keep going with the next title, but let Ctrl-C / SystemExit through.
        print(" --- Unexpected error (" + title + "): ", sys.exc_info()[0])

Exemplo n.º 14
0
def set_proxy():
    """Route scholarly through the configured HTTP/HTTPS proxy, if allowed.

    Does nothing unless ALLOW_PROXY_ON_SCHOLAR is truthy.
    """
    if not ALLOW_PROXY_ON_SCHOLAR:
        return
    generator = ProxyGenerator()
    generator.SingleProxy(http_proxy, https_proxy)
    scholarly.use_proxy(generator)
Exemplo n.º 15
0
    async def __call__(self):
        """
        Run a Google Scholar search for ``self.query`` and show it as embeds.

        ``self.args`` selects the mode: ``None`` searches publications, a
        value containing ``"author"`` searches authors, and a value containing
        ``"cite"`` renders BibTeX citations for publications. Anything else
        edits the message to "Invalid flag". At most five results are turned
        into embeds and handed to ``Sudo.multi_page_system``.
        """
        # Function-scope import so the result-fetching fix below does not
        # depend on the file's import block.
        from itertools import islice

        max_results = 5

        # region various embed types creation
        def publication_embeds(result) -> discord.Embed:
            embed = discord.Embed(
                title=result["bib"]["title"],
                description=result["bib"]["abstract"],
                url=result["eprint_url"]
                if "eprint_url" in result.keys()
                else result["pub_url"],
            )
            embed.add_field(
                name="Authors",
                value=", ".join(result["bib"]["author"]).strip(),
                inline=True,
            )

            embed.add_field(name="Publisher", value=result["bib"]["venue"], inline=True)
            embed.add_field(
                name="Publication Year", value=result["bib"]["pub_year"], inline=True
            )
            embed.add_field(
                name="Cited By",
                value=result["num_citations"]
                if "num_citations" in result.keys()
                else "0",
                inline=True,
            )

            embed.add_field(
                name="Related Articles",
                value=f'https://scholar.google.com{result["url_related_articles"]}',
                inline=True,
            )

            embed.set_footer(text=f"Requested by {self.ctx.author}")
            return embed

        def author_embeds(result) -> discord.Embed:
            embed = discord.Embed(title=result["name"])
            embed.add_field(
                name="Cited By", value=f"{result['citedby']} articles", inline=True
            )
            embed.add_field(name="Scholar ID", value=result["scholar_id"], inline=True)
            embed.add_field(
                name="Affiliation",
                value=result["affiliation"]
                if "affiliation" in result.keys()
                else "None",
                inline=True,
            )
            embed.add_field(
                name="Interests",
                value=f"{', '.join(result['interests']) if 'interests' in result.keys() else 'None'}",
                inline=True,
            )
            embed.set_image(url=result["url_picture"])
            embed.set_footer(text=f"Requested by {self.ctx.author}")
            return embed

        def citation_embeds(result) -> discord.Embed:
            embed = discord.Embed(
                title=result["bib"]["title"],
                description=f"```{scholarly.bibtex(result)}```",
                url=result["eprint_url"]
                if "eprint_url" in result.keys()
                else result["pub_url"],
            )
            embed.set_footer(text=f"Requested by {self.ctx.author}")
            return embed

        # endregion

        try:
            # region user flags processing

            pg = ProxyGenerator()
            proxy = FreeProxy(rand=True, timeout=1, country_id=["BR"]).get()
            pg.SingleProxy(http=proxy, https=proxy)
            scholarly.use_proxy(pg)

            # self.args processing
            # Each branch runs ONE search and takes its first results.
            # Calling next() on a fresh search per iteration (as before)
            # repeated the request and returned the first hit five times.
            if self.args is None:
                results = list(
                    islice(scholarly.search_pubs(self.query), max_results)
                )
                embeds = list(map(publication_embeds, results))
            elif "author" in self.args:
                results = list(
                    islice(scholarly.search_author(self.query), max_results)
                )
                embeds = list(map(author_embeds, results))
            elif "cite" in self.args:
                # Previously this built a list of generator objects, which
                # citation_embeds could not subscript.
                results = list(
                    islice(scholarly.search_pubs(self.query), max_results)
                )
                embeds = list(map(citation_embeds, results))
            else:
                await self.message.edit(content="Invalid flag")
                return
            # endregion

            # Nothing to page through when the search came back empty.
            if not embeds:
                await self.message.edit(content="No results found")
                return

            # sets the reactions for the search result
            # NOTE(review): the multi-page layout wraps each Button in a
            # {Button: None} dict while the single-page layout passes bare
            # Buttons — confirm this is what Sudo.multi_page_system expects.
            if len(embeds) > 1:
                buttons = [[
                    {Button(style=ButtonStyle.grey, label="◀️", custom_id="◀️"): None},
                    {Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️"): None},
                    {Button(style=ButtonStyle.grey, label="▶️", custom_id="▶️"): None}
                ]]
            else:
                buttons = [[
                    Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️")
                ]]

            await Sudo.multi_page_system(self.bot, self.ctx, self.message, tuple(embeds), buttons)
            return

        except asyncio.TimeoutError:
            raise
        except (asyncio.CancelledError, discord.errors.NotFound):
            pass
        except scholarly_exceptions._navigator.MaxTriesExceededException:
            await self.message.edit(
                content="Google Scholar is currently blocking our requests. Please try again later"
            )
            Log.append_to_log(self.ctx, f"{self.ctx.command} error", "MaxTriesExceededException")
            return

        except Exception as e:
            await error_handler(self.bot, self.ctx, e, self.query)
        finally:
            # NOTE(review): returning from `finally` discards any in-flight
            # exception, including the TimeoutError re-raised above. Kept
            # as-is because callers may rely on this method never raising.
            return
Exemplo n.º 16
0
from scholarly import scholarly
import yaml

# something like ssh -D 9050 -q -C -N [email protected]
from scholarly import scholarly, ProxyGenerator
# default values are shown below
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}
pg = ProxyGenerator()
pg.SingleProxy(**proxies)

scholarly.use_proxy(pg)

# Retrieve the author's data, fill-in, and print
#author=scholarly.search_author_id('4poYWhEAAAAJ')
search_query = scholarly.search_author('Vassil Vassilev')

# Start from None so the print below cannot hit an unbound name when the
# search yields no results at all.
author = None
while True:
    print("Iter")
    candidate = next(search_query, None)
    if candidate is None:
        # Results exhausted: keep the last author that was filled, if any.
        break
    author = candidate.fill()
    if 'cern' in author.email:
        break
#sys.exit(1)
print(author)

print("Titles")
Exemplo n.º 17
0
from collections import defaultdict

from scholarly import scholarly, ProxyGenerator
from tqdm import tqdm
from yattag import Doc, indent

# Settings
PEOPLE = [
    "James O'Shea", "Alex Saywell", "Philip Moriarty", "Peter Beton",
    "James Sharp"
]
OUTPUT_DIR = "D:/Nano Group Page/all_pubs"
MIN_YEAR = 1990

# Setup proxy to avoid ignored requests
pg = ProxyGenerator()
# scholarly must be handed the generator itself, not the return value of
# FreeProxies(), so configure first and register afterwards.
pg.FreeProxies()
scholarly.use_proxy(pg)

# Preallocate
pubs_by_year = defaultdict(
    list)  # Defaultdict creates entries if not already existing, so can append.
pubs = []

# Get all publications in an unordered list
for p in PEOPLE:
    search_query = scholarly.search_author(f'{p}, Nottingham')
    author = next(search_query)
    info = scholarly.fill(author, sections=['publications'])
    # extend() keeps `pubs` flat, so no functools/operator flattening pass
    # (neither module was imported) is needed afterwards.
    pubs.extend(info["publications"])