def check_ip(ip_info, port_info, type):
    check_url = "https://bck.hermes.com/product-page?locale=us_en&productsku=H056289CC18"
    ip_url = "%s://%s:%s" % (type, ip_info, port_info)
    manager = ProxyManager(ip_url, timeout=10, cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    headers = util.make_headers(
        accept_encoding='gzip, deflate',
        keep_alive=True,
        user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0")
    headers['Accept-Language'] = "en-US,en;q=0.5"
    headers['Connection'] = 'keep-alive'
    headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    try:
        response = manager.request('GET', check_url, preload_content=False, headers=headers)
        res = response.data
        print(res)
        json.loads(res)
        return True
    except Exception as ex:
        return False
def configure_http_pool():
    global gl_http_pool

    if gl_args.mode == 'auto-scan' or gl_args.mode == 'file-scan':
        timeout = Timeout(connect=1.0, read=3.0)
    else:
        timeout = Timeout(connect=gl_args.timeout, read=6.0)

    if gl_args.proxy:
        # when using a proxy, the protocol must be specified
        if 'http' not in gl_args.host or 'http' not in gl_args.proxy:
            print_and_flush(RED + " * When using a proxy, you must specify the http or https protocol"
                            " (eg. http://%s).\n\n" % (gl_args.host if 'http' not in gl_args.host else gl_args.proxy) + ENDC)
            logging.critical('Protocol not specified')
            exit(1)
        try:
            if gl_args.proxy_cred:
                headers = make_headers(proxy_basic_auth=gl_args.proxy_cred)
                gl_http_pool = ProxyManager(proxy_url=gl_args.proxy, proxy_headers=headers,
                                            timeout=timeout, cert_reqs='CERT_NONE')
            else:
                gl_http_pool = ProxyManager(proxy_url=gl_args.proxy, timeout=timeout, cert_reqs='CERT_NONE')
        except:
            print_and_flush(RED + " * An error occurred while setting the proxy. Please see log for details.\n\n" + ENDC)
            logging.critical('Error while setting the proxy', exc_info=traceback)
            exit(1)
    else:
        gl_http_pool = PoolManager(timeout=timeout, cert_reqs='CERT_NONE')
def get_web_page(url: str, proxies: list = None):
    headers = {
        'User-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
    }
    if proxies:
        import numpy as np
        status = 0
        i = 0
        # keep trying random proxies until one returns HTTP 200 or we run out of attempts
        while status != 200 and i < 10:
            try:
                proxy_url = np.random.choice(proxies)
                print(f"Proxy: {proxy_url}")
                proxy_url = "https://" + proxy_url.strip()
                http = ProxyManager(proxy_url=proxy_url, headers=headers,
                                    cert_reqs='CERT_NONE', assert_hostname=False)
                resp = http.request('GET', url)
                status = resp.status
                print(status)
            except:
                i += 1
    else:
        http = PoolManager(headers=headers, cert_reqs='CERT_NONE', assert_hostname=False)
        resp = http.request('GET', url)
    return resp.status, resp.data.decode('utf-8')
class NCBI_Authetication():

    def __init__(self):
        self.authenticate()

    def authenticate(self):
        self.base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.my_query = "PD-1%20ab%20agonist%5BTitle%2FAbstract%5D)%20AND%20(%222000%2F01%2F01%22%5BDate%20-%20Publication%5D%20%3A%20%223000%22%5BDate%20-%20Publication%5D"
        self.database = "pubmed"
        self.second_url = "esearch.fcgi?db={db}&term={query}&usehistory=y"
        self.final_url = self.base_url + self.second_url.format(db=self.database, query=self.my_query)
        self.http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
        self.response = self.http.request('GET', self.final_url)

        self.http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
        self.firstResponse = self.http.request('GET', self.final_url)

        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.my_query = "id=29554659"
        self.database = "pubmed"
        self.second_url = "elink.fcgi?dbfrom=gene&db={db}&{query}"
        self.final_url = self.base_url + self.second_url.format(db=self.database, query=self.my_query)
        self.http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
        self.secondResponse = self.http.request('GET', self.final_url)

    def get_response(self):
        return self.firstResponse, self.secondResponse
def __init__(self):
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    self.headers = {'User-Agent': user_agent}
    self.ip_url = 'http://icanhazip.com/'
    self.logger = logging.getLogger('gkp')
    retries = Retry(connect=5, read=5, redirect=5)
    self.agent = ProxyManager('http://localhost:8118/',
                              retries=retries,
                              timeout=Timeout(total=60.0))
def request(self):
    QtWidgets.qApp.processEvents()
    self.proxi()
    print(self.stroka2)
    self.prm = ProxyManager(str(self.stroka2))
    print(self.stroka2)
    try:
        QtWidgets.qApp.processEvents()
        r = self.prm.request('GET', 'https://www.yandex.ru/')
    except:
        return False
    return True
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        try:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
        except ProxySchemeUnknown:
            raise GrabMisuseError('Urllib3 transport does '
                                  'not support %s proxies' % req.proxy_type)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ReadTimeoutError as ex:
        raise error.GrabTimeoutError('Read timeout')
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
def run_proxy(url):
    global lock
    #print(lock.locked())
    if prox == '':
        print('No proxys available.')
        return run(url)
    print('Proxy: ' + prox)
    http = ProxyManager(prox)
    try:
        data = {'attribute': 'value'}
        encoded_data = json.dumps(data).encode('utf-8')
        req = http.request(
            'POST', url,
            timeout=3,
            body=encoded_data,
            headers={'Content-Type': 'html/text'})
        print(req.status)
        if req.status == 404:
            print('Item Does not exist.')
            #return run(url)
            return
        if req.status == 501:
            print('Proxy at api call limit')
            get_new_proxy()
            return run_proxy(url)
        if req.status == 407:
            print('Authentication required')
            get_new_proxy()
            return run_proxy(url)
        if req.status != 200:
            print('Unknown Status Code')
            print(req.status)
            get_new_proxy()
            return run_proxy(url)
    except:
        print('Request timed out.')
        get_new_proxy()
        return run(url)
    data = json.loads(req.data)
    req.release_conn()
    data = data['item']
    id = str(data['id'])
    print('ID: ' + id)
    file = open('ItemIds', 'a')
    file.write(id + '\n')
    file.close()
def get_pool(self, req):
    if req['proxy']:
        if req['proxy_auth']:
            proxy_headers = make_headers(proxy_basic_auth=req['proxy_auth'])
        else:
            proxy_headers = None
        proxy_url = '%s://%s' % (req['proxy_type'], req['proxy'])
        pool_key = (req['proxy_type'], req['proxy'], bool(req['verify']))
        if pool_key not in self.pools:
            if req['proxy_type'] == 'socks5':
                opts = {
                    #num_pools=1000,
                    #maxsize=10,
                }
                if req['verify']:
                    pool = SOCKSProxyManager(proxy_url,
                                             cert_reqs='CERT_REQUIRED',
                                             ca_certs=certifi.where(),
                                             **opts)
                else:
                    pool = SOCKSProxyManager(proxy_url, **opts)
            elif req['proxy_type'] == 'http':
                opts = {
                    #num_pools=1000,
                    #maxsize=10,
                }
                if req['verify']:
                    pool = ProxyManager(
                        proxy_url,
                        proxy_headers=proxy_headers,
                        cert_reqs='CERT_REQUIRED',
                        ca_certs=certifi.where(),
                        **opts,
                    )
                else:
                    pool = ProxyManager(proxy_url,
                                        proxy_headers=proxy_headers,
                                        **opts)
            else:
                raise IowebConfigError(
                    'Invalid value of request option `proxy_type`: %s'
                    % req['proxy_type'])
            self.pools[pool_key] = pool
        else:
            pool = self.pools[pool_key]
    else:
        pool = self.pools[(None, None, bool(req['verify']))]
    return pool
def __proxy_pool(self):
    """
    Create Proxy connection pool
    :raise ProxyRequestError
    :return: urllib3.HTTPConnectionPool
    """

    try:
        self.__server = self.__cfg.proxy if True is self.__cfg.is_standalone_proxy else self.__get_random_proxy()
        if self.__get_proxy_type(self.__server) == 'socks':
            disable_warnings(InsecureRequestWarning)
            if not hasattr(self, '__pm'):
                package_module = importlib.import_module('urllib3.contrib.socks')
                self.__pm = getattr(package_module, 'SOCKSProxyManager')
            pool = self.__pm(self.__server, num_pools=self.__cfg.threads,
                             timeout=Timeout(self.__cfg.timeout, read=self.__cfg.timeout),
                             block=True)
        else:
            pool = ProxyManager(self.__server, num_pools=self.__cfg.threads,
                                timeout=Timeout(self.__cfg.timeout, read=self.__cfg.timeout),
                                block=True)
        return pool
    except (DependencyWarning, ProxySchemeUnknown, ImportError) as error:
        raise ProxyRequestError(error)
def check_stock_proxy_manager(url, proxy=None, count=0):
    if proxy is None:
        manager = PoolManager(timeout=5, cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    else:
        proxy_url = "%s://%s:%s" % (proxy[0], proxy[1], proxy[2])
        manager = ProxyManager(proxy_url, timeout=5, cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

    headers = util.make_headers(accept_encoding='gzip, deflate',
                                keep_alive=True,
                                user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0")
    headers['Accept-Language'] = "en-US,en;q=0.5"
    headers['Connection'] = 'keep-alive'
    headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

    try:
        response = manager.request('GET', url, preload_content=False, headers=headers)
        content = json.loads(response.data)
        print("%s - Connect Success!" % count)
        return content['hasStock']
    except Exception as ex:
        print("%s - Connect Error!" % count)
        return False
def __init__(self):
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    self.headers = {'User-Agent': user_agent}
    self.ip_url = 'http://icanhazip.com/'
    retries = Retry(connect=5, read=25, redirect=5)
    self.agent = ProxyManager(
        'http://localhost:8118/',
        retries=retries,
        timeout=Timeout(total=60.0))
def _init_connection(self):
    """Function for initiating connection with remote server"""
    cert_reqs = 'CERT_NONE'
    if self._connection_properties.get('ca_cert_data'):
        LOGGER.info('Using CA cert to confirm identity.')
        cert_reqs = 'CERT_REQUIRED'
        self._connection_properties.update(self._connection_properties.pop('ca_cert_data'))
    if self.proxy:
        if self.proxy.startswith('socks'):
            LOGGER.info("Initializing a SOCKS proxy.")
            http = SOCKSProxyManager(self.proxy, cert_reqs=cert_reqs, maxsize=6,
                                     **self._connection_properties)
        else:
            LOGGER.info("Initializing a HTTP proxy.")
            http = ProxyManager(self.proxy, cert_reqs=cert_reqs, maxsize=6,
                                **self._connection_properties)
    else:
        LOGGER.info("Initializing no proxy.")
        try:
            self._connection_properties.pop('ca_cert_data')
        except KeyError:
            pass
        http = PoolManager(cert_reqs=cert_reqs, maxsize=6, **self._connection_properties)
    self._conn = http.request
def _scrape_market(self, app_id):
    scrape_url = APPLE_APP_URL.format(app_id=app_id)
    header = {'content-type': 'text/html',
              'User-Agent': user_agents[random.randint(0, len(user_agents) - 1)]}
    try:
        response = self.connection_pool.request('GET', scrape_url, timeout=60, retries=2, headers=header)
        if response:
            content = response.data
            if len(content) > REJECT_PAGE_SIZE:
                if len(content) > NORMAL_APP_PAGE_SIZE:
                    self.proxy_service.manage(self.proxy, False)
                    print('Succeed scrape app', app_id)
                    logger.info('Succeed scrape app {}'.format(app_id))
                    return content
                else:
                    print('Invalid app', app_id)
                    logger.info('Invalid app {}'.format(app_id))
            else:
                logger.info('Reject visit app {}, use proxy {}'.format(app_id, self.proxy))
                raise Exception('Reject visit app {}'.format(app_id))
        else:
            raise Exception('Response is None')
    except Exception as ex:
        self.proxy_service.manage(self.proxy, True)
        self.proxy = self.proxy_service.get_proxy('https')
        self.connection_pool = ProxyManager(self.proxy['https']) if self.proxy else PoolManager()
        raise ex
def get_pool(self, req, use_cache=True):
    if req['proxy']:
        if req['proxy_type'] == 'socks5' and req['proxy_auth']:
            proxy_url = '%s://%s@%s' % (req['proxy_type'], req['proxy_auth'], req['proxy'])
        else:
            proxy_url = '%s://%s' % (req['proxy_type'], req['proxy'])
        pool_key = (req['proxy_type'], req['proxy'], bool(req['verify']))
        if not use_cache or pool_key not in self.pools:
            if req['proxy_type'] == 'socks5':
                if req['verify']:
                    pool = SOCKSProxyManager(
                        proxy_url,
                        cert_reqs='CERT_REQUIRED',
                        ca_certs=certifi.where(),
                    )
                else:
                    pool = SOCKSProxyManager(proxy_url)
            elif req['proxy_type'] == 'http':
                if req['proxy_auth']:
                    proxy_headers = make_headers(proxy_basic_auth=req['proxy_auth'])
                else:
                    proxy_headers = None
                if req['verify']:
                    pool = ProxyManager(
                        proxy_url,
                        proxy_headers=proxy_headers,
                        cert_reqs='CERT_REQUIRED',
                        ca_certs=certifi.where(),
                    )
                else:
                    pool = ProxyManager(
                        proxy_url,
                        proxy_headers=proxy_headers,
                    )
            else:
                raise error.IowebConfigError(
                    'Invalid value of request option `proxy_type`: %s'
                    % req['proxy_type'])
            if use_cache:
                self.pools[pool_key] = pool
        else:
            pool = self.pools[pool_key]
    else:
        pool = self.pools[(None, None, bool(req['verify']))]
    return pool
def get_uids(self, term):
    base_url = "https://www.ncbi.nlm.nih.gov/medgen/?term="
    term = term.replace(" ", "+")
    final_url = base_url + term
    http = urllib3.PoolManager()
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    response = http.request('GET', final_url)
    soup = BeautifulSoup(response.data, 'lxml')
    pattern = "<dd>[0-9]*</dd>"
    p = re.compile(pattern)
    ids = p.findall(str(soup))
    ids = [id.replace("<dd>", "").replace("</dd>", "").strip() for id in ids]
    return ids
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        try:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
        except ProxySchemeUnknown:
            raise GrabMisuseError('Urllib3 transport does '
                                  'not support %s proxies' % req.proxy_type)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = ''
    self.request_body = ''
    self.request_log = ''

    self._response = res
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            auth = "%s@" % req.proxy_userpwd
        else:
            auth = ""
        proxy_url = "%s://%s%s" % (req.proxy_type, auth, req.proxy)
        pool = ProxyManager(proxy_url)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        # req_headers = dict((make_unicode(x), make_unicode(y))
        #                    for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        res = pool.urlopen(
            req_method,
            req_url,
            body=req.data,
            timeout=timeout,
            retries=retry,
            headers=req.headers,
            preload_content=False,
        )
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError("Could not create connection")
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = ""
    self.request_body = ""
    self.request_log = ""

    self._response = res
def _check(self, protocol, proxy_url_set):
    valid_proxy_url_set = set()
    for url in proxy_url_set:
        header = {'content-type': 'text/html',
                  'User-Agent': user_agents[random.randint(0, len(user_agents) - 1)]}
        proxy = {protocol: url}
        connection_pool = ProxyManager(url)
        try:
            response = connection_pool.request('GET', CHECK_URL[protocol], timeout=60, headers=header)
            if response.status == 200:
                valid_proxy_url_set.add(url)
                print('Valid proxy url', url)
            else:
                print('Invalid ', url)
        except Exception as ex:
            print(ex)
            print('Invalid ', url)
    return valid_proxy_url_set
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        if req.proxy_type == 'socks5':
            pool = SOCKSProxyManager(proxy_url)  # , proxy_headers=headers)
        else:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ReadTimeoutError as ex:
        raise error.GrabTimeoutError('ReadTimeoutError', ex)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('ConnectTimeoutError', ex)
    except exceptions.ProtocolError as ex:
        # TODO:
        # the code
        # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        # fails
        # with error TypeError: 'OSError' object is not subscriptable
        raise error.GrabConnectionError('ProtocolError', ex)

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
class TorUtility():

    def __init__(self):
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        self.headers = {'User-Agent': user_agent}
        self.ip_url = 'http://icanhazip.com/'
        retries = Retry(connect=5, read=25, redirect=5)
        self.agent = ProxyManager(
            'http://localhost:8118/',
            retries=retries,
            timeout=Timeout(total=60.0))

    def renewTorIdentity(self, passAuth):
        try:
            s = socket.socket()
            s.connect(('localhost', 9051))
            s.send('AUTHENTICATE "{0}"\r\n'.format(passAuth))
            resp = s.recv(1024)
            if resp.startswith('250'):
                s.send("signal NEWNYM\r\n")
                resp = s.recv(1024)
                if resp.startswith('250'):
                    logger.info("Identity renewed")
                else:
                    logger.info("response 2:%s" % resp)
            else:
                logger.info("response 1:%s" % resp)
        except Exception as e:
            logger.error("Can't renew identity: %s" % e)

    def renew_connection(self):
        with Controller.from_port(port=9051) as controller:
            controller.authenticate('natalie')
            controller.signal(Signal.NEWNYM)
            logger.info('*' * 50)
            logger.info('\t' * 6 + 'Renew TOR IP: %s' % self.request(self.ip_url))
            logger.info('*' * 50)

    def request(self, url):
        r = self.agent.request('GET', url)
        if r.status == 200:
            return r.data
        elif r.status == 403:
            self.renew_connection()
        else:
            logger.error('status %s' % r.status)
        return ''

    def current_ip(self):
        return self.request(self.ip_url)
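# A minimal usage sketch for the TorUtility class above, assuming Tor is running
# locally with an HTTP proxy on port 8118 and the control port on 9051 (the same
# addresses the class hard-codes), and that the control password matches the one
# used in renew_connection().
if __name__ == '__main__':
    tor = TorUtility()
    print(tor.current_ip())   # IP as seen by icanhazip.com through the Tor proxy
    tor.renew_connection()    # ask the Tor controller for a new circuit (NEWNYM)
    print(tor.current_ip())   # usually a different exit-node IP now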
def get_uids(self, term):
    # Base Query and More Proxy Management #
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    term = self.preprocess(term).replace(" ", "+")
    second_url = "esearch.fcgi?db={db}&term={query}&retmax=100&format=json"
    final_url = base_url + second_url.format(db=self.ontology, query=term)
    http = urllib3.PoolManager()
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    t.sleep(1)

    # Response data #
    response = http.request('GET', final_url)
    json_data = json.loads(response.data)

    # Updates number of search results #
    self.get_counts(int(json_data['esearchresult']['count']))

    # Returns ID List #
    return json_data['esearchresult']['idlist']
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            auth = '%s@' % req.proxy_userpwd
        else:
            auth = ''
        proxy_url = '%s://%s%s' % (req.proxy_type, auth, req.proxy)
        pool = ProxyManager(proxy_url)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = ''
    self.request_body = ''
    self.request_log = ''

    self._response = res
def __init_connection(self, url=None, proxy=False):
    """Function for initiating connection with remote server

    :param url: The URL of the remote system
    :type url: str
    """
    self.__url = url if url else self.__url
    http = ProxyManager(self.get_proxy(), cert_reqs='CERT_NONE') \
        if self.get_proxy() and proxy \
        else urllib3.PoolManager(cert_reqs='CERT_NONE')
    self._conn = http.request
def get_internet_item(url, html=True):
    """
    get html or data from given url
    :param url: target site url string
    :param html: download html or data boolean
    :return: html string
    """
    if PROXY_MODE == "http_proxy":
        http = ProxyManager(proxy_url=PROXY_URL_PORT)
    elif PROXY_MODE == "auth_proxy":
        auth_proxy_headers = make_headers(proxy_basic_auth=PROXY_BASIC_AUTH)
        http = ProxyManager(proxy_url=PROXY_URL_PORT,
                            proxy_headers=auth_proxy_headers,
                            cert_reqs="CERT_REQUIRED",
                            ca_certs=certifi.where())
    else:
        http = PoolManager(cert_reqs="CERT_REQUIRED", ca_certs=certifi.where())

    r = http.request("GET", url)
    if r.status != 200:
        raise ConnectionError("http request failure")
    if html:
        data = r.data.decode()
    else:
        data = r.data
    return data
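# Hypothetical configuration for get_internet_item() above. PROXY_MODE, PROXY_URL_PORT
# and PROXY_BASIC_AUTH are module-level settings the function expects but that are not
# shown in the snippet, so the values here are placeholders only.
PROXY_MODE = "auth_proxy"                         # "http_proxy", "auth_proxy", or anything else for no proxy
PROXY_URL_PORT = "http://proxy.example.com:8080"  # placeholder proxy address
PROXY_BASIC_AUTH = "username:password"            # forwarded to make_headers(proxy_basic_auth=...)

html = get_internet_item("https://httpbin.org/html")                  # decoded HTML string
raw = get_internet_item("https://httpbin.org/bytes/16", html=False)   # raw bytes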
class TorUtility():

    def __init__(self):
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        self.headers = {'User-Agent': user_agent}
        self.ip_url = 'http://icanhazip.com/'
        self.logger = logging.getLogger('gkp')
        retries = Retry(connect=5, read=5, redirect=5)
        self.agent = ProxyManager('http://localhost:8118/',
                                  retries=retries,
                                  timeout=Timeout(total=60.0))

    def renewTorIdentity(self, passAuth):
        try:
            s = socket.socket()
            s.connect(('localhost', 9051))
            s.send('AUTHENTICATE "{0}"\r\n'.format(passAuth))
            resp = s.recv(1024)
            if resp.startswith('250'):
                s.send("signal NEWNYM\r\n")
                resp = s.recv(1024)
                if resp.startswith('250'):
                    self.logger.info("Identity renewed")
                else:
                    self.logger.info("response 2:%s" % resp)
            else:
                self.logger.info("response 1:%s" % resp)
        except Exception as e:
            self.logger.error("Can't renew identity: %s" % e)

    def renew_connection(self):
        with Controller.from_port(port=9051) as controller:
            controller.authenticate('natalie')
            controller.signal(Signal.NEWNYM)
            self.logger.info('*' * 50)
            self.logger.info('\t' * 6 + 'Renew TOR IP: %s' % self.request(self.ip_url))
            self.logger.info('*' * 50)

    def request(self, url):
        r = self.agent.request('GET', url)
        if r.status == 200:
            return r.data
        elif r.status == 403:
            self.renew_connection()
        else:
            self.logger.error('status %s' % r.status)
        return ''

    def current_ip(self):
        return self.request(self.ip_url)
def get_http_connector(conf, options):
    """
    Used to create http connector, depends on api_proxy configuration parameter

    :param conf: configuration object
    :param options: additional options
    :return: ProxyManager if api_proxy is set, otherwise PoolManager object
    """
    if conf.api_proxy:
        return ProxyManager(conf.api_proxy, **options)
    else:
        return PoolManager(**options)
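# Sketch of how get_http_connector() above might be called. The configuration object
# here is a stand-in (the function only reads an `api_proxy` attribute); the options
# dict maps straight onto PoolManager/ProxyManager keyword arguments.
class _Conf:
    api_proxy = "http://proxy.example.com:3128"   # set to None/"" to get a plain PoolManager

connector = get_http_connector(_Conf(), {"num_pools": 4, "timeout": 10.0})
resp = connector.request("GET", "https://httpbin.org/ip")
print(resp.status)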
def urllib3_from_pool(self, http_request):
    """
    Get a u3 pool from url and request
    :param http_request: HttpRequest
    :type http_request: HttpRequest
    :return urllib3.poolmanager.ProxyManager
    :rtype urllib3.poolmanager.ProxyManager
    """

    if not http_request.http_proxy_host:
        SolBase.sleep(0)
        return self._u3_basic_pool

    # Compute key
    key = "{0}#{1}#".format(
        http_request.http_proxy_host,
        http_request.http_proxy_port,
    )

    # Check
    if key in self._u3_proxy_pool:
        SolBase.sleep(0)
        return self._u3_proxy_pool[key]

    # Allocate (in lock)
    with self._u3_proxy_locker:
        # Check maxed
        if len(self._u3_proxy_pool) >= self._u3_proxy_pool_max:
            raise Exception("u3 pool maxed, cur={0}, max={1}".format(
                len(self._u3_proxy_pool), self._u3_proxy_pool_max))

        # Uri
        proxy_url = "http://{0}:{1}".format(
            http_request.http_proxy_host,
            http_request.http_proxy_port)

        # Ok, allocate
        # Force underlying fifo queue to 1024 via maxsize
        p = ProxyManager(num_pools=1024, maxsize=1024, proxy_url=proxy_url)
        self._u3_proxy_pool[key] = p
        logger.info("Started new pool for key=%s", key)
        SolBase.sleep(0)
        return p
class Downloader:

    def __init__(self, proxy_list):
        self.__proxyCounter = 0
        self.__proxyList = proxy_list
        self.__http = ProxyManager("http://" + self.__proxyList[self.__proxyCounter])

    def try_download(self, url, tries=0):
        try:
            r = self.__http.request('GET', url)
        except:
            if tries > 2:
                print("Too many tries, updating proxy...")
                self.update_proxy()
                r = self.try_download(url)
            else:
                print("Error while downloading from '%s'. Trying again in 3 secs... [%d]" % (url, tries + 1))
                time.sleep(3)
                r = self.try_download(url, tries + 1)
        return r

    def update_proxy(self):
        self.__proxyCounter += 1
        if self.__proxyCounter >= len(self.__proxyList):
            self.__proxyCounter = 0
        self.__http = ProxyManager("http://" + self.__proxyList[self.__proxyCounter])

    def download_to_file(self, url, file_address, tries=0):
        print("Start downloading from: '{0}'".format(url))
        r = self.try_download(url)
        if r.status == 200:
            print("Downloaded. Saving to '{0}'".format(file_address))
            f = open(file_address, 'wb')
            f.write(r.data)
            f.close()
        elif r.status // 100 == 5:
            print("Something wrong with server (%s). Waiting 2 secs and trying again... [%d]" % (r.status, tries + 1))
            time.sleep(2)
            if tries < 5:
                self.download_to_file(url, file_address, tries + 1)
            else:
                print("Too many tries. Aborting! Try to start update later")
                return -1
        else:
            print("Wrong response status: {0}".format(r.status))
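# Example use of the Downloader class above, assuming a list of plain "host:port"
# HTTP proxies (the class prepends "http://" itself). The proxy addresses and the
# output file path are placeholders.
if __name__ == '__main__':
    downloader = Downloader(["203.0.113.10:3128", "203.0.113.11:8080"])
    downloader.download_to_file("https://example.com/archive.zip", "archive.zip")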
def __init_connection(self, url=None, proxy=False):
    """Function for initiating connection with remote server

    :param url: The URL of the remote system
    :type url: str
    """
    self.__url = url if url else self.__url
    if self.get_proxy() and proxy:
        if self.get_proxy().startswith('socks'):
            LOGGER.info("Initializing a SOCKS proxy.")
            http = SOCKSProxyManager(self.get_proxy(), cert_reqs='CERT_NONE')
        else:
            LOGGER.info("Initializing a HTTP proxy.")
            http = ProxyManager(self.get_proxy(), cert_reqs='CERT_NONE')
    else:
        LOGGER.info("Initializing no proxy.")
        http = urllib3.PoolManager(cert_reqs='CERT_NONE')
    self._conn = http.request
def __init__(self, configuration: Configuration, pools_size: int = 4):
    self.configuration = configuration
    self.header_params: dict = {}
    self.user_agent = 'felix-scholz/website-python-client/1.0.1/python'

    addition_pool_args = {}
    if configuration.assert_hostname is not None:
        addition_pool_args['assert_hostname'] = configuration.assert_hostname
    if configuration.retries is not None:
        addition_pool_args['retries'] = configuration.retries

    if configuration.proxy:
        self.pool_manager = ProxyManager(
            num_pools=pools_size,
            maxsize=configuration.connection_pool_maxsize
            if configuration.connection_pool_maxsize is not None else 4,
            cert_reqs=ssl.CERT_REQUIRED if configuration.verify_ssl else ssl.CERT_NONE,
            ca_certs=configuration.ssl_ca_cert if configuration.ssl_ca_cert is not None else certifi.where(),
            cert_file=configuration.cert_file,
            key_file=configuration.key_file,
            proxy_url=configuration.proxy,
            proxy_headers=configuration.proxy_headers,
            **addition_pool_args)
    else:
        self.pool_manager = PoolManager(
            num_pools=pools_size,
            maxsize=configuration.connection_pool_maxsize
            if configuration.connection_pool_maxsize is not None else 4,
            cert_reqs=ssl.CERT_REQUIRED if configuration.verify_ssl else ssl.CERT_NONE,
            ca_certs=configuration.ssl_ca_cert if configuration.ssl_ca_cert is not None else certifi.where(),
            cert_file=configuration.cert_file,
            key_file=configuration.key_file,
            **addition_pool_args)
def fetch_title(proxy: urllib3.ProxyManager, title_id) -> dict:
    url = f"https://mangadex.org/api/v2/manga/{title_id}"
    scrape_id = db.run_sql(
        "INSERT INTO scrape (proxy, url) VALUES (?, ?)",
        (proxy.proxy_url, url),
        return_last_insert_rowid=True,
    )
    resp = proxy.request("GET", url)
    assert resp.status in [200, 404], resp.data
    db.run_sql(
        """
        UPDATE scrape
        SET resp_status = ?,
            resp_body = ?,
            ended_at = datetime('now')
        WHERE id = ?;
        """,
        (resp.status, resp.data, scrape_id),
    )
    print("Saved title", title_id, "-", resp.status)
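# Rough driver for fetch_title() above. Note that the function reads `proxy.proxy_url`,
# which a stock urllib3 ProxyManager does not expose, so this sketch attaches that
# attribute by hand; the proxy address, the title id, and the `db` helper module are
# assumptions carried over from the snippet.
proxy = urllib3.ProxyManager("http://127.0.0.1:8888")
proxy.proxy_url = "http://127.0.0.1:8888"   # recorded in the scrape table
fetch_title(proxy, 12345)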
def _init_connection(self):
    """Function for initiating connection with remote server"""
    if self._connection_properties.get('ca_certs', None):
        LOGGER.info('Using CA cert to confirm identity.')
        cert_reqs = 'CERT_REQUIRED'
    else:
        LOGGER.info('Not using CA certificate.')
        cert_reqs = 'CERT_NONE'
    if self.proxy:
        if self.proxy.startswith('socks'):
            LOGGER.info("Initializing a SOCKS proxy.")
            http = SOCKSProxyManager(self.proxy, cert_reqs=cert_reqs, maxsize=6,
                                     **self._connection_properties)
        else:
            LOGGER.info("Initializing a HTTP proxy.")
            http = ProxyManager(self.proxy, cert_reqs=cert_reqs, maxsize=6,
                                **self._connection_properties)
    else:
        LOGGER.info("Initializing no proxy.")
        http = PoolManager(cert_reqs=cert_reqs, maxsize=6, **self._connection_properties)
    self._conn = http.request
def __init__(self):
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    self.headers = {'User-Agent': user_agent}
    self.ip_url = 'http://icanhazip.com/'
    self.logger = logging.getLogger('gkp')
    retries = Retry(connect=5, read=5, redirect=5)
    self.http = ProxyManager('http://localhost:8118/',
                             retries=retries,
                             timeout=Timeout(total=60.0))
def get_terms(self, term, id, id_string, number_of_results, is_match=False):
    # Make API call to get xml data #
    term = self.lemmatize(self.preprocess(term))

    # Proxy Code and Base Query #
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    second_url = "esummary.fcgi?db=medgen&db=medgen&{query}"
    final_url = base_url + second_url.format(db=self.ontology, query="id=" + id_string)
    http = urllib3.PoolManager()
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    t.sleep(1)
    response = http.request('GET', final_url)
    soup = BeautifulSoup(response.data, 'lxml')

    # Get the separate hits in lists #
    hits = soup.find_all('documentsummary')

    # Dictionary to store the results #
    results = []

    # Set threshold, take the min of the threshold requested and the total number of search results #
    threshold = min(self.threshold, number_of_results)

    # For every hit (each hit represents data from ONE UID) #
    for hit in hits:
        # Keeps track of meeting the threshold #
        counter = 0

        # Check if return is a disease #
        check = "Blank" if not len(hit.find("semanticid")) else hit.find("semanticid").text.strip()

        # List of acceptable semantic types #
        semantic_types = ['T191', 'T047', 'T048', 'T019', 'T190', 'T033', 'T049', 'T046', 'T184', "Blank"]

        # If term is a disease, execute the following: #
        if check in semantic_types:
            # Get Concept ID #
            concept_id = "Blank" if not len(hit.find('conceptid')) else hit.find('conceptid').text.strip()

            # Get Title #
            title = hit.find('title').text.strip()

            # Get name tags for looping #
            name_tags = hit.find_all('name')

            # Get definition/description #
            definition = hit.find('definition').text.strip()
            def_score = self.modified_jaccard_similarity(term, definition)

            # Get SAB, CODE, SCUI, SDUI, and Title #
            processed_term = self.stem(term)
            new_title = self.stem(self.lemmatize(self.preprocess(title)))

            # Keeps track of best scores for each uid #
            scores = []

            # Loop through synonyms #
            for data in name_tags:
                # Get the max syn_score between a synonym and the title #
                new_text = self.stem(self.lemmatize(self.preprocess(data.text)))
                syn_score = max(fuzz.ratio(new_text, processed_term),
                                fuzz.ratio(processed_term, new_title))
                syn_score = max(
                    fuzz.ratio(new_text, processed_term),
                    fuzz.ratio(processed_term, new_title)
                ) if len(new_text.split()) == 1 and len(new_title.split()) == 1 and len(processed_term.split()) == 1 \
                    else self.jaccard_similarity(new_text, processed_term)

                # If score is 100 or the term is one word, take the syn_score #
                score = syn_score if len(term.split()) == 1 or syn_score == 100 else max(syn_score, def_score)

                # Initialize dictionary to add to results #
                value = dict()
                code, sab, scui, sdui = None, None, None, None
                index = hits.index(hit)

                # Add Basic Data MetaData to Dictionary #
                value['Disease_Input'] = term
                value['Ontology'] = self.ontology
                value['Synonym'] = data.text
                value['Description'] = definition
                value['Semantic_Type'] = check
                value['UID'] = id[index]
                value['Ontology_ID'] = concept_id
                value['Final_Score'] = syn_score + def_score
                value['Synonym_Score'] = syn_score
                value['Description_Score'] = def_score
                value['Title'] = title
                value['Number_of_Results'] = number_of_results
                value['Holder'] = score

                # Add extra metadata that may throw errors and add to dictionary #
                try:
                    code = data['code']
                    value['CODE'] = code
                except:
                    value['CODE'] = np.nan
                try:
                    sab = data['sab']
                    value['SAB'] = sab
                except:
                    value['SAB'] = np.nan
                try:
                    scui = data['scui']
                    value['SCUI'] = scui
                except:
                    value['SCUI'] = np.nan
                try:
                    sdui = data['sdui']
                    value['SDUI'] = sdui
                except:
                    value['SDUI'] = np.nan
                scores.append(value)

            # This code takes scores (as it has metadata for only ONE uid) and finds the best match #
            # Get the best score, if scores has results (it may be empty) #
            if scores:
                # Gets the dictionary with the highest score and its corresponding data #
                best_score_data = max(scores, key=lambda x: x['Final_Score'])
                best_score = best_score_data['Holder']
                results.append(best_score_data)

                # If best score is greater than or equal to the threshold, increase counter (a step closer to threshold) #
                if best_score >= self.score_threshold or threshold == 1:
                    counter += 1

                # If threshold is met, then return results #
                if counter == threshold:
                    return results
    return results
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        if req.proxy_type == 'socks5':
            pool = SOCKSProxyManager(
                proxy_url,
                cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where())  # , proxy_headers=headers)
        else:
            pool = ProxyManager(proxy_url,
                                proxy_headers=headers,
                                cert_reqs='CERT_REQUIRED',
                                ca_certs=certifi.where())
    else:
        pool = self.pool

    with self.wrap_transport_error():
        # Retries can be disabled by passing False:
        # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
        # Do not use False because of warning:
        # Converted retries value: False -> Retry(total=False,
        # connect=None, read=None, redirect=0, status=None)
        retry = Retry(
            total=False,
            connect=False,
            read=False,
            redirect=0,
            status=None,
        )
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        try:
            res = pool.urlopen(req_method, req_url,
                               body=req.data, timeout=timeout,
                               retries=retry, headers=req.headers,
                               preload_content=False)
        except UnicodeError as ex:
            raise error.GrabConnectionError('GrabInvalidUrl', ex)
        #except exceptions.ReadTimeoutError as ex:
        #    raise error.GrabTimeoutError('ReadTimeoutError', ex)
        #except exceptions.ConnectTimeoutError as ex:
        #    raise error.GrabConnectionError('ConnectTimeoutError', ex)
        #except exceptions.ProtocolError as ex:
        #    # TODO:
        #    # the code
        #    # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        #    # fails
        #    # with error TypeError: 'OSError' object is not subscriptable
        #    raise error.GrabConnectionError('ProtocolError', ex)
        #except exceptions.SSLError as ex:
        #    raise error.GrabConnectionError('SSLError', ex)

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
def __init__(self, error_dict):
    super(AppleDetailSpider, self).__init__(error_dict)
    self.market = 'apple'
    self.proxy = self.proxy_service.get_proxy('https')
    self.connection_pool = ProxyManager(self.proxy['https']) if self.proxy else PoolManager()
def get_terms(self, term, id, number_of_results):
    # Make API call to get json_data #
    term = self.lemmatize(self.preprocess(term))

    # It stores a given score result that will be added to scores, then to results #
    json_dict = dict()

    # Base Query and More Proxy Management #
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    second_url = "esummary.fcgi?db=mesh&db=mesh&{query}&format=json"
    final_url = base_url + second_url.format(db=self.ontology, query="id=" + id)
    http = urllib3.PoolManager()
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    t.sleep(1)

    # Response data #
    response = http.request('GET', final_url)
    json_data = json.loads(response.data)
    uids = json_data['result']['uids']

    # Holds a list of dictionaries, will be converted to dataframe #
    results = []

    # Take the minimum of what the threshold is, versus the number of search hits #
    threshold = min(self.threshold, number_of_results)

    # Loop through each uid in the uids list #
    for uid in uids:
        # Keeps track of uids that score at or above the scoring requirement, used for pruning #
        counter = 0

        # This represents json data from the UID that is CURRENTLY being looped through #
        json_section = json_data['result'][uid]

        # Check if ID is a disease #
        check_id = self.filter_by_disease(id, json_section)

        # If the search term is a disease... #
        if check_id:
            # Pure extracted data from json file before processing #
            scope_note = json_section["ds_scopenote"]
            mesh_id = json_section["ds_meshui"]
            mesh_terms = json_section["ds_meshterms"]

            # Initialize score variables #
            score = None
            syn_score = None
            processed_term = self.stem(term)
            def_score = self.modified_jaccard_similarity(term, scope_note)

            # Keeps track of best scores for each uid #
            scores = []

            # If there's only one search result, take it (regardless of score), and return it #
            # Adding it to just the scores list is fine since it's the only output #
            if threshold == 1:
                processed_mesh_term = self.stem(self.lemmatize(self.preprocess(mesh_terms[0])))
                syn_score = fuzz.ratio(processed_mesh_term, processed_term) \
                    if len(processed_term.split()) == 1 and len(processed_mesh_term) == 1 \
                    else self.jaccard_similarity(processed_mesh_term, processed_term)
                score = max(syn_score, def_score)
                json_dict = {
                    'Ontology': self.ontology,
                    'UID': uid,
                    'Ontology_ID': mesh_id,
                    'Disease_Input': term,
                    "Synonym": mesh_terms[0],
                    "Description": scope_note,
                    'Number_of_Results': number_of_results,
                    'Synonym_Score': syn_score,
                    'Description_Score': def_score,
                    'Final_Score': syn_score + def_score,
                    'Holder': score
                }
                scores.append(json_dict)
                return scores
            else:
                # Loop through each synonym in mesh_terms for scoring #
                for mesh_term in mesh_terms:
                    # Prepare synonym for Levenshtein distance matching (through fuzzy library) #
                    processed_mesh_term = self.stem(self.lemmatize(self.preprocess(mesh_term)))
                    syn_score = fuzz.ratio(processed_mesh_term, processed_term) \
                        if len(processed_term.split()) == 1 and len(processed_mesh_term) == 1 \
                        else self.jaccard_similarity(processed_mesh_term, processed_term)

                    # If term is only one word, just take the syn_score as its final score, otherwise take the max #
                    score = syn_score if len(term.split()) == 1 else max(syn_score, def_score)

                    # If the score is >= 60, add it to the scores list #
                    json_dict = {
                        'Ontology': self.ontology,
                        'UID': uid,
                        'Ontology_ID': mesh_id,
                        'Disease_Input': term,
                        "Synonym": mesh_term,
                        "Description": scope_note,
                        'Number_of_Results': number_of_results,
                        'Synonym_Score': syn_score,
                        'Description_Score': def_score,
                        'Final_Score': syn_score + def_score,
                        'Holder': score
                    }
                    scores.append(json_dict)

            # This code takes scores (as it has metadata for only ONE uid) and finds the best match #
            # Get the best score, if scores has results (it may be empty) #
            if scores:
                # Gets the dictionary with the highest score and its corresponding data #
                best_score_data = max(scores, key=lambda x: x['Final_Score'])
                best_score = best_score_data['Holder']
                results.append(best_score_data)

                # If best score is greater than or equal to the threshold, increase counter (a step closer to threshold) #
                if best_score >= self.score_threshold or threshold == 1:
                    counter += 1

                # If threshold is met, then return results #
                if counter == threshold:
                    return results
    return results