def search(self, query: str, generic_cover: str = "", locale: str = "en") -> Optional[List[MetaRecord]]:
    """Look up ``query`` through ``scholarly.search_pubs`` and parse the hits.

    Args:
        query: Free-text search string. It is split with
            ``self.get_title_tokens(query, strip_joiners=False)``; when that
            yields tokens, each one is UTF-8 encoded, passed through
            ``quote`` and the results are joined with spaces to form the
            query that is actually sent.
        generic_cover: Accepted for interface compatibility. It is not
            forwarded: ``_parse_search_result`` always receives ``""``.
            NOTE(review): confirm this is intentional.
        locale: Forwarded unchanged to ``_parse_search_result``.

    Returns:
        A list with at most 10 parsed records, an empty list when the
        provider is not active, or ``None`` when fetching the results
        raised (the exception is logged as a warning).
    """
    if not self.active:
        return []

    title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
    if title_tokens:
        query = " ".join(quote(token.encode("utf-8")) for token in title_tokens)

    try:
        scholarly.set_timeout(20)
        scholarly.set_retries(2)
        # islice() is lazy: results are only pulled when iterated. Drain it
        # here so that errors raised while fetching are handled by the
        # except clause below instead of escaping from a later loop.
        results = list(itertools.islice(scholarly.search_pubs(query), 10))
    except Exception as e:
        log.warning(e)
        return None

    # Parsing stays outside the try block so that a bug in the parser is not
    # silently reported as "search failed".
    return [
        self._parse_search_result(result=result, generic_cover="", locale=locale)
        for result in results
    ]
def setUp(self):
    """Configure the proxy used by ``scholarly`` before each test.

    The connection method is read from the ``CONNECTION_METHOD`` environment
    variable when that key is present in ``scholarly.env``; otherwise it
    defaults to ``"none"``. The chosen value is stored on
    ``self.connection_method``. Recognised values are ``"tor"``,
    ``"tor_internal"``, ``"luminati"`` and ``"freeproxy"``; anything else
    disables the proxy with ``scholarly.use_proxy(None)``.
    """
    proxy_generator = ProxyGenerator()
    if "CONNECTION_METHOD" in scholarly.env:
        self.connection_method = os.getenv("CONNECTION_METHOD")
    else:
        self.connection_method = "none"

    if self.connection_method == "tor":
        tor_sock_port = None
        tor_control_port = None
        tor_password = "******"
        # Tor uses the 9050 port as the default socks port
        # on windows 9150 for socks and 9151 for control
        if sys.platform.startswith(("linux", "darwin")):
            tor_sock_port = 9050
            tor_control_port = 9051
        elif sys.platform.startswith("win"):
            tor_sock_port = 9150
            tor_control_port = 9151
        proxy_generator.Tor_External(tor_sock_port, tor_control_port,
                                     tor_password)
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "tor_internal":
        # Previously tor_cmd was only bound on linux and windows, so this
        # branch raised UnboundLocalError on macOS (and any other platform).
        # Use the Windows executable name on Windows and plain 'tor'
        # everywhere else, matching the platforms the "tor" branch supports.
        tor_cmd = 'tor.exe' if sys.platform.startswith("win") else 'tor'
        proxy_generator.Tor_Internal(tor_cmd=tor_cmd)
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "luminati":
        scholarly.set_retries(10)
        # Credentials and port come from the USERNAME, PASSWORD and PORT
        # environment variables; os.getenv returns None for any that are unset.
        proxy_generator.Luminati(usr=os.getenv("USERNAME"),
                                 passwd=os.getenv("PASSWORD"),
                                 proxy_port=os.getenv("PORT"))
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "freeproxy":
        proxy_generator.FreeProxies()
        scholarly.use_proxy(proxy_generator)
    else:
        scholarly.use_proxy(None)
# """ "Regulated+Domain" """,
# """ "FDA+Requirement" """,
# """ "Continuous+Delivery" """,
# """ "Continuous+Integration" """,
# """ "Automation+Systems" """,
# """ "Software+Validation" """,
# """ "Continuous+Software+Engineering" """
# ]

# Route scholarly's requests through an external Tor proxy
# (SOCKS port 9050, control port 9051) and allow up to 100 retries.
pg = ProxyGenerator()
pg.Tor_External(
    tor_sock_port=9050,
    tor_control_port=9051,
    tor_password="******",
)
scholarly.use_proxy(pg)
scholarly.set_retries(100)


def CreateSearchQuery(keyword):
    """Run a custom-URL search for ``keyword`` and return scholarly's result.

    The keyword is stripped of surrounding whitespace and appended as the
    ``q`` parameter of a fixed ``/scholar`` URL carrying:

        hl=en, lr=lang_en, as_vis=0, as_sdt=0,33

    The keyword is inserted as-is (no URL encoding is applied here), so
    callers must supply it already encoded, e.g. with ``+`` between words.

    Returns whatever ``scholarly.search_pubs_custom_url`` returns for that
    URL (described as an iterator by the original author's comment).
    """
    custom_url = f"/scholar?hl=en&lr=lang_en&as_vis=0&as_sdt=0,33&q={keyword.strip()}"
    return scholarly.search_pubs_custom_url(custom_url)