def setUp(self):
    """Configure the proxy that scholarly uses for the tests.

    The connection method is taken from the ``CONNECTION_METHOD`` environment
    variable when that key is present in ``scholarly.env``; otherwise it
    falls back to ``"none"``. The chosen value is stored on
    ``self.connection_method`` and selects one of:

    * ``"tor"``          -- ``ProxyGenerator.Tor_External`` with
      platform-dependent ports.
    * ``"tor_internal"`` -- ``ProxyGenerator.Tor_Internal`` with a
      platform-dependent command name.
    * ``"luminati"``     -- ``ProxyGenerator.Luminati`` using the
      ``USERNAME``, ``PASSWORD`` and ``PORT`` environment variables.
    * ``"freeproxy"``    -- ``ProxyGenerator.FreeProxies``.
    * anything else      -- ``scholarly.use_proxy(None)``.
    """
    proxy_generator = ProxyGenerator()
    if "CONNECTION_METHOD" in scholarly.env:
        self.connection_method = os.getenv("CONNECTION_METHOD")
    else:
        self.connection_method = "none"

    if self.connection_method == "tor":
        # Ports stay None on platforms that match neither branch below.
        tor_sock_port = None
        tor_control_port = None
        tor_password = "******"
        # Tor uses the 9050 port as the default socks port
        # on windows 9150 for socks and 9151 for control
        if sys.platform.startswith(("linux", "darwin")):
            tor_sock_port = 9050
            tor_control_port = 9051
        elif sys.platform.startswith("win"):
            tor_sock_port = 9150
            tor_control_port = 9151
        proxy_generator.Tor_External(tor_sock_port, tor_control_port,
                                     tor_password)
        scholarly.use_proxy(proxy_generator)

    elif self.connection_method == "tor_internal":
        # Windows uses 'tor.exe'; every other platform (including macOS)
        # uses plain 'tor', so tor_cmd is always bound before the call.
        tor_cmd = 'tor.exe' if sys.platform.startswith("win") else 'tor'
        proxy_generator.Tor_Internal(tor_cmd=tor_cmd)
        scholarly.use_proxy(proxy_generator)

    elif self.connection_method == "luminati":
        scholarly.set_retries(10)
        proxy_generator.Luminati(usr=os.getenv("USERNAME"),
                                 passwd=os.getenv("PASSWORD"),
                                 proxy_port=os.getenv("PORT"))
        scholarly.use_proxy(proxy_generator)

    elif self.connection_method == "freeproxy":
        proxy_generator.FreeProxies()
        scholarly.use_proxy(proxy_generator)

    else:
        scholarly.use_proxy(None)
def test_tor_launch_own_process(self):
    """
    Test that we can launch a Tor process

    Calls ``ProxyGenerator.Tor_Internal`` with randomly chosen socks and
    control ports, checks the returned status dict, then runs one author
    search through the resulting proxy.
    """
    proxy_generator = ProxyGenerator()
    # Windows uses 'tor.exe'; every other platform (including macOS) uses
    # plain 'tor', so tor_cmd is always bound before the call below.
    tor_cmd = 'tor.exe' if sys.platform.startswith("win") else 'tor'

    # The two ranges do not overlap, so the ports can never collide.
    tor_sock_port = random.randrange(9000, 9500)
    tor_control_port = random.randrange(9500, 9999)

    result = proxy_generator.Tor_Internal(tor_cmd, tor_sock_port,
                                          tor_control_port)
    self.assertTrue(result["proxy_works"])
    self.assertTrue(result["refresh_works"])
    self.assertEqual(result["tor_control_port"], tor_control_port)
    self.assertEqual(result["tor_sock_port"], tor_sock_port)

    # Check that we can issue a query as well
    query = 'Ipeirotis'
    scholarly.use_proxy(proxy_generator)
    authors = list(scholarly.search_author(query))
    self.assertGreaterEqual(len(authors), 1)
continue return file_list def get_ids(): files_list = get_articles_files_list(ARTICLES_INPUT_FOLDER) print(files_list) for file in files_list: get_author_ids_for_file(file) break print("Started connection to tor !") pg = ProxyGenerator() pg.Tor_Internal(tor_cmd='tor') scholarly.use_proxy(pg) print("Connection to tor done successfully !") get_author_ids_for_file('articles3_copy.csv') # get_ids() """ ### the following section adds the missing columns to the publications csv files """ def add_columns_to_publications(): files_list = get_articles_files_list(ARTICLES_INPUT_FOLDER) for file in files_list: file_path = os.path.join(ARTICLES_INPUT_FOLDER, file)
import arxiv
import os
import glob
from googlesearch import search
import tarfile
from scholarly import scholarly, ProxyGenerator
from functools import lru_cache
import re
import time
import random

# Relative directory path; presumably where downloaded papers are stored
# (its use is not visible in this part of the file -- confirm).
paper_download_dir = './papers'

# Import-time side effect: build a ProxyGenerator, configure it through
# Tor_Internal with the `tor` command, and hand it to scholarly.
pg = ProxyGenerator()
pg.Tor_Internal(tor_cmd="tor")
scholarly.use_proxy(pg)


class color:
    """ANSI escape sequences for styling terminal output.

    Prefix text with one of the style codes and append ``END`` to switch
    the styling back off.
    """
    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Reset sequence: clears every colour/attribute set above.
    END = '\033[0m'
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path

# Configure a ProxyGenerator through Tor_Internal with the `tor` command and
# make scholarly use it for the requests below.
proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

# Take the first search hit for the author and fill in its details.
search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))

# Only publications with at least one citation are filled.
pubs = [
    scholarly.fill(pub)
    for pub in author['publications']
    if pub['num_citations'] > 0
]

# Pair every publication that has a 'citedby_url' with the list of entries
# returned by scholarly.citedby. Appending keeps `pubs2` a flat list of
# [publication, citations] pairs; rebinding it to `[pubs2, [...]]` on each
# iteration would nest the accumulated results one level deeper every time.
pubs2 = []
for pub in pubs:
    if 'citedby_url' in pub:
        pubs2.append([pub, list(scholarly.citedby(pub))])

# Objects json cannot encode are replaced by a placeholder string.
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))