def test_ScraperAPI(self):
    proxy_generator = ProxyGenerator()
    proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY'))
    scholarly.set_timeout(60)
    scholarly.use_proxy(proxy_generator)  # attach the configured proxy before querying
    # Reuses another test method to verify that the proxy is working.
    self.test_search_keyword()
def get_research_articles(self, max_num):
    # Search string for Google Scholar to look for.
    # e.g. f"{self.title} {self.director.name}" would equate to
    # "Concussion Peter Landesman" for the movie Concussion.
    search_str = f'{self.title} {self.director.name}'
    output = ""
    try:
        pg = ProxyGenerator()
        ip = os.environ['PROXY_IP']
        pg.SingleProxy(http=ip, https=ip)
        scholarly.use_proxy(pg)
        search_query = scholarly.search_pubs(search_str)
        for _ in range(max_num):
            curr = next(search_query)
            # For debugging purposes, this is how you pretty-print the search query's contents:
            # scholarly.pprint(curr)
            # Grab the title of the article.
            title = curr['bib']['title']
            # Begin the formatted HTML output for each found research article.
            output += "<li>\n"
            # If a publication URL (i.e. curr['pub_url']) exists, add an external link to it.
            if 'pub_url' in curr:
                output += f'<a target="_blank" href="{curr["pub_url"]}">{title}</a>\n'
            else:
                output += f"{title}\n"
            output += "<br>\n"
            # Write the abstract (i.e. curr['bib']['abstract']) if it exists.
            if 'bib' in curr and 'abstract' in curr['bib']:
                output += f"<p>{curr['bib']['abstract']}</p>\n"
            output += "</li>\n"
    except Exception as e:
        # Useful for seeing errors in your terminal. Replace pass with the print below.
        # print(e, file=sys.stderr)
        pass
    return output
def test_tor_launch_own_process(self):
    """
    Test that we can launch a Tor process
    """
    proxy_generator = ProxyGenerator()
    if sys.platform.startswith("linux"):
        tor_cmd = 'tor'
    elif sys.platform.startswith("win"):
        tor_cmd = 'tor.exe'
    tor_sock_port = random.randrange(9000, 9500)
    tor_control_port = random.randrange(9500, 9999)
    result = proxy_generator.Tor_Internal(tor_cmd, tor_sock_port, tor_control_port)
    self.assertTrue(result["proxy_works"])
    self.assertTrue(result["refresh_works"])
    self.assertEqual(result["tor_control_port"], tor_control_port)
    self.assertEqual(result["tor_sock_port"], tor_sock_port)
    # Check that we can issue a query as well.
    query = 'Ipeirotis'
    scholarly.use_proxy(proxy_generator)
    authors = [a for a in scholarly.search_author(query)]
    self.assertGreaterEqual(len(authors), 1)
def setUp(self):
    proxy_generator = ProxyGenerator()
    if "CONNECTION_METHOD" in scholarly.env:
        self.connection_method = os.getenv("CONNECTION_METHOD")
    else:
        self.connection_method = "none"
    if self.connection_method == "tor":
        tor_sock_port = None
        tor_control_port = None
        tor_password = "******"
        # Tor uses the 9050 port as the default socks port;
        # on windows 9150 for socks and 9151 for control.
        if sys.platform.startswith("linux") or sys.platform.startswith("darwin"):
            tor_sock_port = 9050
            tor_control_port = 9051
        elif sys.platform.startswith("win"):
            tor_sock_port = 9150
            tor_control_port = 9151
        proxy_generator.Tor_External(tor_sock_port, tor_control_port, tor_password)
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "tor_internal":
        if sys.platform.startswith("linux"):
            tor_cmd = 'tor'
        elif sys.platform.startswith("win"):
            tor_cmd = 'tor.exe'
        proxy_generator.Tor_Internal(tor_cmd=tor_cmd)
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "luminati":
        scholarly.set_retries(10)
        proxy_generator.Luminati(usr=os.getenv("USERNAME"),
                                 passwd=os.getenv("PASSWORD"),
                                 proxy_port=os.getenv("PORT"))
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "freeproxy":
        proxy_generator.FreeProxies()
        scholarly.use_proxy(proxy_generator)
    else:
        scholarly.use_proxy(None)
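This setUp picks its connection method from the CONNECTION_METHOD environment variable, so the variable must be in the environment before the test process imports scholarly. A hypothetical way to drive each branch, shown as comments (the test module name is an assumption):

# Hypothetical invocations; setUp supports tor, tor_internal, luminati,
# freeproxy, or leaving the variable unset for a direct connection:
#   CONNECTION_METHOD=freeproxy python -m unittest test_module
#   CONNECTION_METHOD=tor_internal python -m unittest test_module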
        else:
            continue
    return file_list


def get_ids():
    files_list = get_articles_files_list(ARTICLES_INPUT_FOLDER)
    print(files_list)
    for file in files_list:
        get_author_ids_for_file(file)
        break


print("Started connection to tor!")
pg = ProxyGenerator()
pg.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(pg)
print("Connection to tor done successfully!")
get_author_ids_for_file('articles3_copy.csv')
# get_ids()

"""
### the following section adds the missing columns to the publications csv files
"""


def add_columns_to_publications():
    files_list = get_articles_files_list(ARTICLES_INPUT_FOLDER)
    for file in files_list:
import logging
import os

from scholarly import scholarly, ProxyGenerator

from futurewater.crossref import get_publication
from futurewater.util import format_author

MAX_RETRIES_ON_ERROR = 3

# https://scholarly.readthedocs.io/en/latest/quickstart.html#installation
# https://github.com/scholarly-python-package/scholarly
# https://github.com/OpenAPC/openapc-de/blob/master/python/import_dois.py

logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logger = logging.getLogger()

pg = ProxyGenerator()
pg.Tor_External(tor_sock_port=9050, tor_control_port=9051, tor_password="******")
scholarly.use_proxy(pg)


def get_schoolar_data(author_name, cache_folder="scholarly", affiliation='UBC'):
    output_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):
import arxiv
import os
import glob
from googlesearch import search
import tarfile
from scholarly import scholarly, ProxyGenerator
from functools import lru_cache
import re
import time
import random

paper_download_dir = './papers'

pg = ProxyGenerator()
pg.Tor_Internal(tor_cmd="tor")
scholarly.use_proxy(pg)


class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
import os

env_path = Path('../') / '.env'
load_dotenv(dotenv_path=env_path)
SCRAPER = os.getenv("SCRAPER")

proxy_generator = ProxyGenerator()
proxy_generator.ScraperAPI(SCRAPER)
scholarly.set_timeout(60)
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]
pubs2 = [[pub, list(scholarly.citedby(pub))] for pub in pubs
         if 'citedby_url' in pub]
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
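This variant loads its ScraperAPI key from a .env file one directory up via python-dotenv; a sketch of that file's assumed contents, with a placeholder value:

# ../.env — contents assumed; substitute your own ScraperAPI key
SCRAPER=your_scraperapi_key_here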
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
from fp.fp import FreeProxy

proxy_generator = ProxyGenerator()
proxy_generator.FreeProxies()
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]
pubs2 = [[pub, list(scholarly.citedby(pub))] for pub in pubs
         if 'citedby_url' in pub]
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path

proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]
pubs2 = []
for pub in pubs:
    if 'citedby_url' in pub:
        # Append the pair; the original nested the list into itself,
        # producing an ever-deeper structure instead of a flat list.
        pubs2.append([pub, list(scholarly.citedby(pub))])
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
import re
import time
import random as rand
import pickle as pkl

import numpy as np
import pandas as pd

from fp.fp import FreeProxy
from scholarly import scholarly, ProxyGenerator

pg = ProxyGenerator()
# proxy = FreeProxy(rand=True, timeout=10, country_id=['BR', 'US']).get()
# pg.SingleProxy(http=proxy, https=proxy)
# scholarly.use_proxy(pg)

query = """(automated OR automation) AND ("building code" OR compliance) AND (check OR validation) AND (AI OR "Machine Learning" OR NLP OR "Artificial Intelligence") AND (Building OR Construction OR "Town Planning")"""


# Safely get a value from a dictionary, returning None if the key is missing.
def safe_get_key(x: dict, key: str):
    try:
        value = x[key]
    except KeyError:
        value = None
    return value
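Neither query nor safe_get_key is exercised in the excerpt above; a hypothetical pairing of the two, pulling a few results into a DataFrame (the field names mirror scholarly's publication dict, and the cap of 10 is arbitrary):

# Hypothetical usage of `query` and `safe_get_key` together.
results = scholarly.search_pubs(query)
rows = []
for _ in range(10):  # arbitrary cap for illustration
    pub = next(results)
    rows.append({
        'title': safe_get_key(pub['bib'], 'title'),
        'year': safe_get_key(pub['bib'], 'pub_year'),
        'citations': safe_get_key(pub, 'num_citations'),
    })
df = pd.DataFrame(rows)
print(df)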
from scholarly import scholarly
from OSMPythonTools.nominatim import Nominatim
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from fp.fp import FreeProxy
from scholarly import ProxyGenerator
from time import sleep

pg = ProxyGenerator()
proxy = FreeProxy(rand=True, timeout=1, country_id=['BR']).get()
pg.SingleProxy(http=proxy, https=proxy)
scholarly.use_proxy(pg)


def plot_citations(author_name):
    m = Basemap(projection='mill', lon_0=180)
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')
    search_query = scholarly.search_author(author_name)
    # Note: .fill(), .bib and .citedby below are the pre-1.0 scholarly object API.
    author = next(search_query).fill()
    print(author)
    for pub in [author.publications[0]]:
        print('Title: ', pub.bib['title'])
        pub = pub.fill()
        sleep(45)
        for citation in pub.citedby:
            print(citation)
            sleep(45)
            firstAuthorId = None
            while firstAuthorId is None or len(citation.bib['author_id']) == 0:
#!env python
from scholarly import scholarly
from scholarly import ProxyGenerator
import fileinput
import sys

pg = ProxyGenerator()
pg.Tor_External(9050, 9051, 'password')
scholarly.use_proxy(pg)

for a in fileinput.input():
    # Lines include the trailing newline, so strip before the empty check.
    if not a.strip():
        continue
    try:
        search_query = scholarly.search_pubs(a)
        aa = next(search_query).fill()
        print(a.rstrip(), end='')
        bib = aa.bib
        print("," + str(bib['gsrank']), end='')
        print("," + str(bib['cites']), end='')
        print("")
    except:
        print(" --- Unexpected error (" + a + "): ", sys.exc_info()[0])
        pass
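A note on invocation: fileinput reads from stdin or from any file names passed as arguments, so a run might look like the comments below (the script and file names are assumptions):

# Hypothetical usage:
#   python gsrank.py titles.txt > ranked.csv
# titles.txt holds one publication title per line; each output line is
# the title followed by ,gsrank,cites columns.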
def set_proxy():
    if ALLOW_PROXY_ON_SCHOLAR:
        pg = ProxyGenerator()
        pg.SingleProxy(http_proxy, https_proxy)
        scholarly.use_proxy(pg)
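set_proxy() references three module-level names that sit outside this excerpt; a minimal sketch of what they might look like, with placeholder values:

# Hypothetical configuration assumed by set_proxy(); the names come from
# the function body above, the addresses are placeholders.
ALLOW_PROXY_ON_SCHOLAR = True
http_proxy = "http://127.0.0.1:8080"
https_proxy = "http://127.0.0.1:8080"

set_proxy()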
async def __call__(self):
    UserCancel = KeyboardInterrupt

    # region various embed types creation
    def publication_embeds(result) -> discord.Embed:
        embed = discord.Embed(
            title=result["bib"]["title"],
            description=result["bib"]["abstract"],
            url=result["eprint_url"]
            if "eprint_url" in result.keys()
            else result["pub_url"],
        )
        embed.add_field(
            name="Authors",
            value=", ".join(result["bib"]["author"]).strip(),
            inline=True,
        )
        embed.add_field(name="Publisher", value=result["bib"]["venue"], inline=True)
        embed.add_field(
            name="Publication Year", value=result["bib"]["pub_year"], inline=True
        )
        embed.add_field(
            name="Cited By",
            value=result["num_citations"] if "num_citations" in result.keys() else "0",
            inline=True,
        )
        embed.add_field(
            name="Related Articles",
            value=f'https://scholar.google.com{result["url_related_articles"]}',
            inline=True,
        )
        embed.set_footer(text=f"Requested by {self.ctx.author}")
        return embed

    def author_embeds(result) -> discord.Embed:
        embed = discord.Embed(title=result["name"])
        embed.add_field(
            name="Cited By", value=f"{result['citedby']} articles", inline=True
        )
        embed.add_field(name="Scholar ID", value=result["scholar_id"], inline=True)
        embed.add_field(
            name="Affiliation",
            value=result["affiliation"] if "affiliation" in result.keys() else "None",
            inline=True,
        )
        embed.add_field(
            name="Interests",
            value=f"{', '.join(result['interests']) if 'interests' in result.keys() else 'None'}",
            inline=True,
        )
        embed.set_image(url=result["url_picture"])
        embed.set_footer(text=f"Requested by {self.ctx.author}")
        return embed

    def citation_embeds(result) -> discord.Embed:
        embed = discord.Embed(
            title=result["bib"]["title"],
            description=f"```{scholarly.bibtex(result)}```",
            url=result["eprint_url"]
            if "eprint_url" in result.keys()
            else result["pub_url"],
        )
        embed.set_footer(text=f"Requested by {self.ctx.author}")
        return embed
    # endregion

    try:
        # region user flags processing
        pg = ProxyGenerator()
        proxy = FreeProxy(rand=True, timeout=1, country_id=["BR"]).get()
        pg.SingleProxy(http=proxy, https=proxy)
        scholarly.use_proxy(pg)

        # self.args processing; each branch draws five distinct results from a
        # single generator (the original re-created or reused the generator
        # object itself, yielding duplicates).
        if self.args is None:
            search = scholarly.search_pubs(self.query)
            results = [next(search) for _ in range(5)]
            embeds = list(map(publication_embeds, results))
        elif "author" in self.args:
            search = scholarly.search_author(self.query)
            results = [next(search) for _ in range(5)]
            embeds = list(map(author_embeds, results))
        elif "cite" in self.args:
            search = scholarly.search_pubs(self.query)
            results = [next(search) for _ in range(5)]
            embeds = list(map(citation_embeds, results))
        else:
            await self.message.edit(content="Invalid flag")
            return
        # endregion

        # sets the reactions for the search result
        if len(embeds) > 1:
            buttons = [[
                {Button(style=ButtonStyle.grey, label="◀️", custom_id="◀️"): None},
                {Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️"): None},
                {Button(style=ButtonStyle.grey, label="▶️", custom_id="▶️"): None},
            ]]
        else:
            buttons = [[Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️")]]

        await Sudo.multi_page_system(self.bot, self.ctx, self.message, tuple(embeds), buttons)
        return
    except asyncio.TimeoutError:
        raise
    except (asyncio.CancelledError, discord.errors.NotFound):
        pass
    except scholarly_exceptions._navigator.MaxTriesExceededException:
        await self.message.edit(
            content="Google Scholar is currently blocking our requests. Please try again later"
        )
        Log.append_to_log(self.ctx, f"{self.ctx.command} error", "MaxTriesExceededException")
        return
    except Exception as e:
        await error_handler(self.bot, self.ctx, e, self.query)
    finally:
        return
from scholarly import scholarly, ProxyGenerator
import yaml

# something like ssh -D 9050 -q -C -N [email protected]

# default values are shown below
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}
pg = ProxyGenerator()
pg.SingleProxy(**proxies)
scholarly.use_proxy(pg)

# Retrieve the author's data, fill in, and print
# author = scholarly.search_author_id('4poYWhEAAAAJ')
search_query = scholarly.search_author('Vassil Vassilev')
while True:
    print("Iter")
    try:
        author = next(search_query).fill()
        if 'cern' in author.email:
            break
        # print(author)
    except StopIteration:
        break  # sys.exit(1)

print(author)
print("Titles")
import functools
import operator
from collections import defaultdict

from scholarly import scholarly, ProxyGenerator
from tqdm import tqdm
from yattag import Doc, indent

# Settings
PEOPLE = [
    "James O'Shea", "Alex Saywell", "Philip Moriarty", "Peter Beton",
    "James Sharp"
]
OUTPUT_DIR = "D:/Nano Group Page/all_pubs"
MIN_YEAR = 1990

# Set up a proxy to avoid ignored requests. FreeProxies() returns a success
# flag, so configure the generator first, then hand it to use_proxy.
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

# Preallocate. defaultdict creates entries if not already existing, so we can append.
pubs_by_year = defaultdict(list)
pubs = []

# Get all publications in an unordered list
for p in PEOPLE:
    search_query = scholarly.search_author(f'{p}, Nottingham')
    author = next(search_query)
    info = scholarly.fill(author, sections=['publications'])
    pubs.append(info["publications"])
pubs = functools.reduce(operator.iconcat, pubs, [])
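The excerpt stops after flattening pubs; a minimal sketch of how the preallocated pubs_by_year could then be filled, assuming each publication dict carries bib['pub_year'] and reusing the MIN_YEAR setting above:

# Hypothetical continuation: bucket publications by year, skipping old ones.
for pub in pubs:
    year = int(pub['bib'].get('pub_year', 0))
    if year >= MIN_YEAR:
        pubs_by_year[year].append(pub)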