def get_web_page(url: str, proxies: list = None):
    headers = {
        'User-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
    }
    if proxies:
        import numpy as np
        status = 0
        i = 0
        # Retry through randomly chosen proxies until a 200 response or 10 failed attempts.
        while status != 200 and i < 10:
            try:
                proxy_url = np.random.choice(proxies)
                print(f"Proxy: {proxy_url}")
                proxy_url = "https://" + proxy_url.strip()
                http = ProxyManager(proxy_url=proxy_url,
                                    headers=headers,
                                    cert_reqs='CERT_NONE',
                                    assert_hostname=False)
                resp = http.request('GET', url)
                status = resp.status
                print(status)
            except Exception:
                i += 1
    else:
        http = PoolManager(headers=headers, cert_reqs='CERT_NONE', assert_hostname=False)
        resp = http.request('GET', url)
    return resp.status, resp.data.decode('utf-8')
class NCBI_Authetication():
    def __init__(self):
        self.authenticate()

    def authenticate(self):
        # First query: esearch against PubMed, keeping the history server session.
        self.base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.my_query = "PD-1%20ab%20agonist%5BTitle%2FAbstract%5D)%20AND%20(%222000%2F01%2F01%22%5BDate%20-%20Publication%5D%20%3A%20%223000%22%5BDate%20-%20Publication%5D"
        self.database = "pubmed"
        self.second_url = "esearch.fcgi?db={db}&term={query}&usehistory=y"
        self.final_url = self.base_url + self.second_url.format(
            db=self.database, query=self.my_query)
        self.http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
        self.firstResponse = self.http.request('GET', self.final_url)

        # Second query: elink from the gene database to PubMed for a single id.
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.my_query = "id=29554659"
        self.database = "pubmed"
        self.second_url = "elink.fcgi?dbfrom=gene&db={db}&{query}"
        self.final_url = self.base_url + self.second_url.format(
            db=self.database, query=self.my_query)
        self.http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
        self.secondResponse = self.http.request('GET', self.final_url)

    def get_response(self):
        return self.firstResponse, self.secondResponse
def get_internet_item(url, html=True):
    """
    get html or data from given url
    :param url: target site url string
    :param html: download html or data boolean
    :return: html string
    """
    if PROXY_MODE == "http_proxy":
        http = ProxyManager(proxy_url=PROXY_URL_PORT)
    elif PROXY_MODE == "auth_proxy":
        auth_proxy_headers = make_headers(proxy_basic_auth=PROXY_BASIC_AUTH)
        http = ProxyManager(proxy_url=PROXY_URL_PORT,
                            proxy_headers=auth_proxy_headers,
                            cert_reqs="CERT_REQUIRED",
                            ca_certs=certifi.where())
    else:
        http = PoolManager(cert_reqs="CERT_REQUIRED", ca_certs=certifi.where())
    r = http.request("GET", url)
    if r.status != 200:
        raise ConnectionError("http request failure")
    if html:
        data = r.data.decode()
    else:
        data = r.data
    return data
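A minimal usage sketch for get_internet_item; the module-level constants, proxy address, and credentials below are placeholder assumptions, not values from the source.

# Placeholder wiring for get_internet_item (assumed names and values).
import certifi
from urllib3 import PoolManager, ProxyManager
from urllib3.util import make_headers

PROXY_MODE = "auth_proxy"                         # "http_proxy", "auth_proxy", or anything else for a direct connection
PROXY_URL_PORT = "http://proxy.example.com:8080"  # hypothetical proxy endpoint
PROXY_BASIC_AUTH = "username:password"            # hypothetical credentials for the authenticating proxy

page_html = get_internet_item("https://example.com")              # decoded HTML string
raw_bytes = get_internet_item("https://example.com", html=False)  # raw response bytes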
def check_ip(ip_info, port_info, type):
    check_url = "https://bck.hermes.com/product-page?locale=us_en&productsku=H056289CC18"
    ip_url = "%s://%s:%s" % (type, ip_info, port_info)
    manager = ProxyManager(ip_url,
                           timeout=10,
                           cert_reqs='CERT_REQUIRED',
                           ca_certs=certifi.where())
    headers = util.make_headers(
        accept_encoding='gzip, deflate',
        keep_alive=True,
        user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0")
    headers['Accept-Language'] = "en-US,en;q=0.5"
    headers['Connection'] = 'keep-alive'
    headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    try:
        response = manager.request('GET', check_url, preload_content=False, headers=headers)
        res = response.data
        print(res)
        # The proxy is considered usable only if the check endpoint returns valid JSON.
        json.loads(res)
        return True
    except Exception as ex:
        return False
class TorUtility():
    def __init__(self):
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        self.headers = {'User-Agent': user_agent}
        self.ip_url = 'http://icanhazip.com/'
        retries = Retry(connect=5, read=25, redirect=5)
        # All traffic goes through the local Privoxy/Tor HTTP proxy on port 8118.
        self.agent = ProxyManager('http://localhost:8118/',
                                  retries=retries,
                                  timeout=Timeout(total=60.0))

    def renewTorIdentity(self, passAuth):
        try:
            s = socket.socket()
            s.connect(('localhost', 9051))
            s.send('AUTHENTICATE "{0}"\r\n'.format(passAuth))
            resp = s.recv(1024)
            if resp.startswith('250'):
                s.send("signal NEWNYM\r\n")
                resp = s.recv(1024)
                if resp.startswith('250'):
                    logger.info("Identity renewed")
                else:
                    logger.info("response 2:%s" % resp)
            else:
                logger.info("response 1:%s" % resp)
        except Exception as e:
            logger.error("Can't renew identity: %s" % e)

    def renew_connection(self):
        with Controller.from_port(port=9051) as controller:
            controller.authenticate('natalie')
            controller.signal(Signal.NEWNYM)
            logger.info('*' * 50)
            logger.info('\t' * 6 + 'Renew TOR IP: %s' % self.request(self.ip_url))
            logger.info('*' * 50)

    def request(self, url):
        r = self.agent.request('GET', url)
        if r.status == 200:
            return r.data
        elif r.status == 403:
            # Forbidden usually means the exit node is blocked; request a new Tor circuit.
            self.renew_connection()
        else:
            logger.error('status %s' % r.status)
        return ''

    def current_ip(self):
        return self.request(self.ip_url)
class TorUtility():
    def __init__(self):
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        self.headers = {'User-Agent': user_agent}
        self.ip_url = 'http://icanhazip.com/'
        self.logger = logging.getLogger('gkp')
        retries = Retry(connect=5, read=5, redirect=5)
        # All traffic goes through the local Privoxy/Tor HTTP proxy on port 8118.
        self.agent = ProxyManager('http://localhost:8118/',
                                  retries=retries,
                                  timeout=Timeout(total=60.0))

    def renewTorIdentity(self, passAuth):
        try:
            s = socket.socket()
            s.connect(('localhost', 9051))
            s.send('AUTHENTICATE "{0}"\r\n'.format(passAuth))
            resp = s.recv(1024)
            if resp.startswith('250'):
                s.send("signal NEWNYM\r\n")
                resp = s.recv(1024)
                if resp.startswith('250'):
                    self.logger.info("Identity renewed")
                else:
                    self.logger.info("response 2:%s" % resp)
            else:
                self.logger.info("response 1:%s" % resp)
        except Exception as e:
            self.logger.error("Can't renew identity: %s" % e)

    def renew_connection(self):
        with Controller.from_port(port=9051) as controller:
            controller.authenticate('natalie')
            controller.signal(Signal.NEWNYM)
            self.logger.info('*' * 50)
            self.logger.info('\t' * 6 + 'Renew TOR IP: %s' % self.request(self.ip_url))
            self.logger.info('*' * 50)

    def request(self, url):
        r = self.agent.request('GET', url)
        if r.status == 200:
            return r.data
        elif r.status == 403:
            # Forbidden usually means the exit node is blocked; request a new Tor circuit.
            self.renew_connection()
        else:
            self.logger.error('status %s' % r.status)
        return ''

    def current_ip(self):
        return self.request(self.ip_url)
def run_proxy(url):
    global lock
    # print(lock.locked())
    if prox == '':
        print('No proxies available.')
        return run(url)
    print('Proxy: ' + prox)
    http = ProxyManager(prox)
    try:
        data = {'attribute': 'value'}
        encoded_data = json.dumps(data).encode('utf-8')
        req = http.request(
            'POST', url,
            timeout=3,
            body=encoded_data,
            headers={'Content-Type': 'html/text'})
        print(req.status)
        if req.status == 404:
            print('Item does not exist.')
            # return run(url)
            return
        if req.status == 501:
            print('Proxy at api call limit')
            get_new_proxy()
            return run_proxy(url)
        if req.status == 407:
            print('Authentication required')
            get_new_proxy()
            return run_proxy(url)
        if req.status != 200:
            print('Unknown Status Code')
            print(req.status)
            get_new_proxy()
            return run_proxy(url)
    except Exception:
        print('Request timed out.')
        get_new_proxy()
        return run(url)
    data = json.loads(req.data)
    req.release_conn()
    data = data['item']
    id = str(data['id'])
    print('ID: ' + id)
    file = open('ItemIds', 'a')
    file.write(id + '\n')
    file.close()
def get_uids(self, term):
    base_url = "https://www.ncbi.nlm.nih.gov/medgen/?term="
    term = term.replace(" ", "+")
    final_url = base_url + term
    # Route the request through the corporate proxy.
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    response = http.request('GET', final_url)
    soup = BeautifulSoup(response.data, 'lxml')
    # MedGen UIDs appear in the page as <dd>123456</dd> entries.
    pattern = "<dd>[0-9]*</dd>"
    p = re.compile(pattern)
    ids = p.findall(str(soup))
    ids = [id.replace("<dd>", "").replace("</dd>", "").strip() for id in ids]
    return ids
class Downloader:
    def __init__(self, proxy_list):
        self.__proxyCounter = 0
        self.__proxyList = proxy_list
        self.__http = ProxyManager("http://" + self.__proxyList[self.__proxyCounter])

    def try_download(self, url, tries=0):
        try:
            r = self.__http.request('GET', url)
        except:
            if tries > 2:
                print("Too many tries, updating proxy...")
                self.update_proxy()
                r = self.try_download(url)
            else:
                print("Error while downloading from '%s'. Trying again in 3 secs... [%d]" % (url, tries + 1))
                time.sleep(3)
                r = self.try_download(url, tries + 1)
        return r

    def update_proxy(self):
        # Rotate to the next proxy, wrapping around at the end of the list.
        self.__proxyCounter += 1
        if self.__proxyCounter >= len(self.__proxyList):
            self.__proxyCounter = 0
        self.__http = ProxyManager("http://" + self.__proxyList[self.__proxyCounter])

    def download_to_file(self, url, file_adress, tries=0):
        print("Start downloading from: '{0}'".format(url))
        r = self.try_download(url)
        if r.status == 200:
            print("Downloaded. Saving to '{0}'".format(file_adress))
            f = open(file_adress, 'wb')
            f.write(r.data)
            f.close()
        elif r.status // 100 == 5:
            print("Something wrong with server (%s). Waiting 2 secs and trying again... [%d]" % (r.status, tries + 1))
            time.sleep(2)
            if tries < 5:
                self.download_to_file(url, file_adress, tries + 1)
            else:
                print("Too many tries. Aborting! Try to start update later")
                return -1
        else:
            print("Wrong response status: {0}".format(r.status))
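A minimal usage sketch for the Downloader class above; the proxy addresses and target URL are placeholder assumptions, not values from the source.

# Hypothetical "host:port" proxy strings, in the form the constructor expects.
import time
from urllib3 import ProxyManager

proxies = ["203.0.113.10:3128", "203.0.113.11:8080"]  # placeholder proxies

downloader = Downloader(proxies)
# Fetches through the current proxy, rotating to the next one after repeated failures.
downloader.download_to_file("https://example.com/data.bin", "data.bin")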
def _check(self, protocol, proxy_url_set):
    valid_proxy_url_set = set()
    for url in proxy_url_set:
        header = {'content-type': 'text/html',
                  'User-Agent': user_agents[random.randint(0, len(user_agents) - 1)]}
        proxy = {protocol: url}
        conection_pool = ProxyManager(url)
        try:
            response = conection_pool.request('GET', CHECK_URL[protocol], timeout=60, headers=header)
            if response.status == 200:
                valid_proxy_url_set.add(url)
                print 'Valid proxy url', url
            else:
                print 'Invalid ', url
        except Exception as ex:
            print ex
            print 'Invalid ', url
    return valid_proxy_url_set
def get_uids(self, term):
    # Base Query and More Proxy Management #
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    term = self.preprocess(term).replace(" ", "+")
    second_url = "esearch.fcgi?db={db}&term={query}&retmax=100&format=json"
    final_url = base_url + second_url.format(db=self.ontology, query=term)
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    t.sleep(1)

    # Response data #
    response = http.request('GET', final_url)
    json_data = json.loads(response.data)

    # Updates number of search results #
    self.get_counts(int(json_data['esearchresult']['count']))

    # Returns ID List #
    return json_data['esearchresult']['idlist']
def fetch_title(proxy: urllib3.ProxyManager, title_id) -> dict:
    url = f"https://mangadex.org/api/v2/manga/{title_id}"
    # Record the scrape attempt before making the request.
    scrape_id = db.run_sql(
        "INSERT INTO scrape (proxy, url) VALUES (?, ?)",
        (proxy.proxy_url, url),
        return_last_insert_rowid=True,
    )
    resp = proxy.request("GET", url)
    assert resp.status in [200, 404], resp.data
    db.run_sql(
        """
        UPDATE scrape
        SET resp_status = ?,
            resp_body = ?,
            ended_at = datetime('now')
        WHERE id = ?;
        """,
        (resp.status, resp.data, scrape_id),
    )
    print("Saved title", title_id, "-", resp.status)
class AppleDetailSpider(AppDetailSpider):
    def __init__(self, error_dict):
        super(AppleDetailSpider, self).__init__(error_dict)
        self.market = 'apple'
        self.proxy = self.proxy_service.get_proxy('https')
        self.connection_pool = ProxyManager(self.proxy['https']) if self.proxy else PoolManager()

    @retry(2)
    def _scrape_market(self, app_id):
        scrape_url = APPLE_APP_URL.format(app_id=app_id)
        header = {'content-type': 'text/html',
                  'User-Agent': user_agents[random.randint(0, len(user_agents) - 1)]}
        try:
            response = self.connection_pool.request('GET', scrape_url, timeout=60, retries=2, headers=header)
            if response:
                content = response.data
                if len(content) > REJECT_PAGE_SIZE:
                    if len(content) > NORMAL_APP_PAGE_SIZE:
                        self.proxy_service.manage(self.proxy, False)
                        print 'Succeed scrape app', app_id
                        logger.info('Succeed scrape app {}'.format(app_id))
                        return content
                    else:
                        print 'Invalid app', app_id
                        logger.info('Invalid app {}'.format(app_id))
                else:
                    logger.info('Reject visit app {}, use proxy {}'.format(app_id, self.proxy))
                    raise Exception('Reject visit app {}'.format(app_id))
            else:
                raise Exception('Response is None')
        except Exception as ex:
            # Mark the proxy as bad, switch to a fresh one, and let @retry re-run the scrape.
            self.proxy_service.manage(self.proxy, True)
            self.proxy = self.proxy_service.get_proxy('https')
            self.connection_pool = ProxyManager(self.proxy['https']) if self.proxy else PoolManager()
            raise ex
def get_ip(self):
    # Ask icanhazip.com for the external IP as seen through the local proxy on port 8118.
    http = ProxyManager('http://127.0.0.1:8118')
    body = http.request('GET', 'http://icanhazip.com')
    return str(body.data, 'utf-8').replace('\n', '')
def get_terms(self, term, id, number_of_results):
    # Make API call to get json_data #
    term = self.lemmatize(self.preprocess(term))

    # It stores a given score result that will be added to scores, then to results #
    json_dict = dict()

    # Base Query and More Proxy Management #
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    second_url = "esummary.fcgi?db=mesh&db=mesh&{query}&format=json"
    final_url = base_url + second_url.format(db=self.ontology, query="id=" + id)
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    t.sleep(1)

    # Response data #
    response = http.request('GET', final_url)
    json_data = json.loads(response.data)
    uids = json_data['result']['uids']

    # Holds a list of dictionaries, will be converted to dataframe #
    results = []

    # Take the minimum of what the threshold is, versus the number of search hits #
    threshold = min(self.threshold, number_of_results)

    # Loop through each uid in the uids list #
    for uid in uids:
        # Keeps track of uids that score at or above the scoring requirement, used for pruning #
        counter = 0

        # This represents json data from the UID that is currently being looped through #
        json_section = json_data['result'][uid]

        # Check if ID is a disease #
        check_id = self.filter_by_disease(id, json_section)

        # If the search term is a disease... #
        if check_id:
            # Pure extracted data from json file before processing #
            scope_note = json_section["ds_scopenote"]
            mesh_id = json_section["ds_meshui"]
            mesh_terms = json_section["ds_meshterms"]

            # Initialize score variables #
            score = None
            syn_score = None
            processed_term = self.stem(term)
            def_score = self.modified_jaccard_similarity(term, scope_note)

            # Keeps track of best scores for each uid #
            scores = []

            # If there's only one search result, take it (regardless of score), and return it #
            # Adding it to just the scores list is fine since it's the only output #
            if threshold == 1:
                processed_mesh_term = self.stem(self.lemmatize(self.preprocess(mesh_terms[0])))
                # Use fuzzy matching when both strings are single words, Jaccard similarity otherwise #
                syn_score = fuzz.ratio(processed_mesh_term, processed_term) \
                    if len(processed_term.split()) == 1 and len(processed_mesh_term.split()) == 1 \
                    else self.jaccard_similarity(processed_mesh_term, processed_term)
                score = max(syn_score, def_score)
                json_dict = {
                    'Ontology': self.ontology,
                    'UID': uid,
                    'Ontology_ID': mesh_id,
                    'Disease_Input': term,
                    'Synonym': mesh_terms[0],
                    'Description': scope_note,
                    'Number_of_Results': number_of_results,
                    'Synonym_Score': syn_score,
                    'Description_Score': def_score,
                    'Final_Score': syn_score + def_score,
                    'Holder': score
                }
                scores.append(json_dict)
                return scores
            else:
                # Loop through each synonym in mesh_terms for scoring #
                for mesh_term in mesh_terms:
                    # Prepare synonym for Levenshtein distance matching (through fuzzy library) #
                    processed_mesh_term = self.stem(self.lemmatize(self.preprocess(mesh_term)))
                    syn_score = fuzz.ratio(processed_mesh_term, processed_term) \
                        if len(processed_term.split()) == 1 and len(processed_mesh_term.split()) == 1 \
                        else self.jaccard_similarity(processed_mesh_term, processed_term)

                    # If term is only one word, just take the syn_score as its final score, otherwise take the max #
                    score = syn_score if len(term.split()) == 1 else max(syn_score, def_score)

                    # Add the scored synonym to the scores list #
                    json_dict = {
                        'Ontology': self.ontology,
                        'UID': uid,
                        'Ontology_ID': mesh_id,
                        'Disease_Input': term,
                        'Synonym': mesh_term,
                        'Description': scope_note,
                        'Number_of_Results': number_of_results,
                        'Synonym_Score': syn_score,
                        'Description_Score': def_score,
                        'Final_Score': syn_score + def_score,
                        'Holder': score
                    }
                    scores.append(json_dict)

                # This code takes scores (as it has metadata for only ONE uid) and finds the best match #
                # Get the best score, if scores has results (it may be empty) #
                if scores:
                    # Gets the dictionary with the highest score and its corresponding data #
                    best_score_data = max(scores, key=lambda x: x['Final_Score'])
                    best_score = best_score_data['Holder']
                    results.append(best_score_data)

                    # If best score is greater than or equal to the threshold, increase counter (a step closer to threshold) #
                    if best_score >= self.score_threshold or threshold == 1:
                        counter += 1

                    # If threshold is met, then return results #
                    if counter == threshold:
                        return results

    return results
def update_db(self, year):
    filename = CVE_FEED_FILENAME.replace('$$$$', year) + '.json'
    file_path = path.join(CACHE_PATH, filename)
    meta_filename = CVE_FEED_FILENAME.replace('$$$$', year) + '.meta'
    meta_file_path = path.join(CACHE_PATH, year + '.meta')

    # Use the HTTP proxy configured in the environment when one is set.
    if environ.get('http_proxy') is not None:
        http = ProxyManager(environ.get('http_proxy'), maxsize=10)
    else:
        http = PoolManager()
    disable_warnings(urllib3_exceptions.InsecureRequestWarning)

    r = None
    meta = None
    try:
        r = http.request('GET', CVE_FEED_URL + meta_filename, preload_content=False)
    except Exception as e:
        print("[!] Error obtaining CVE meta data: " + str(e))

    if path.isfile(meta_file_path):
        with open(meta_file_path, 'r') as myfile:
            meta = myfile.read()

    # If the remote meta file matches the cached one, the local feed is already current.
    if r is not None and meta is not None and r.data.decode('utf-8').replace('\r', '') == meta:
        return
    else:
        if r is not None:
            with open(meta_file_path, 'wb') as out_file:
                copyfileobj(r, out_file)

    try:
        with http.request('GET', CVE_FEED_URL + filename + '.zip',
                          preload_content=False) as r, open(file_path + '.zip', 'wb') as out_file:
            copyfileobj(r, out_file)
    except Exception as e:
        print("[!] Error downloading CVE feed: " + str(e))
        return

    try:
        archive = ZipFile(file_path + '.zip', 'r')
        xml_data = archive.extract(filename, CACHE_PATH)
    except Exception as e:
        print("[!] Error extracting the CVE archive: " + str(e))
        return

    cve_cache = []
    actions = []
    count = 0
    with open(file_path, encoding='utf-8') as data_file:
        data = json.loads(data_file.read())["CVE_Items"]
        for i in data:
            item = cve_item()
            item.id = i["cve"]["CVE_data_meta"]["ID"]
            for j in i['cve']['references']['reference_data']:
                item.references.append(j)
            item.summary = i['cve']['description']['description_data'][0]["value"]
            for j in i['configurations']['nodes']:
                if 'cpe' in j:
                    for k in j['cpe']:
                        item.affected.append({
                            "vuln": k['vulnerable'],
                            "cpe22": k['cpe22Uri'],
                            "cpe23": k['cpe23Uri'],
                            "vStartE": k.get('versionStartExcluding', ''),
                            "vStartI": k.get('versionStartIncluding', ''),
                            "vEndE": k.get('versionEndExcluding', ''),
                            "vEndI": k.get('versionEndIncluding', '')
                        })
                elif 'children' in j:
                    for t in j['children']:
                        if 'cpe' in t:
                            for k in t['cpe']:
                                item.affected.append({
                                    "vuln": k['vulnerable'],
                                    "cpe22": k['cpe22Uri'],
                                    "cpe23": k['cpe23Uri'],
                                    "vStartE": k.get('versionStartExcluding', ''),
                                    "vStartI": k.get('versionStartIncluding', ''),
                                    "vEndE": k.get('versionEndExcluding', ''),
                                    "vEndI": k.get('versionEndIncluding', '')
                                })
            if 'baseMetricV3' in i['impact']:
                item.cvss['vector_string_v3'] = i['impact']['baseMetricV3']['cvssV3']['vectorString']
                item.cvss['score_v3'] = i['impact']['baseMetricV3']['cvssV3']['baseScore']
            if 'baseMetricV2' in i['impact']:
                item.cvss['vector_string_v2'] = i['impact']['baseMetricV2']['cvssV2']['vectorString']
                item.cvss['score_v2'] = i['impact']['baseMetricV2']['cvssV2']['baseScore']
            item.published = i['publishedDate']
            item.last_modified = i['lastModifiedDate']
            cve_cache.append(item)
            if USE_ELASTIC_SEARCH:
                actions.append({
                    "_index": "cve-" + year,
                    "_type": "vulns",
                    "_source": {
                        'cve_id': item.id,
                        'summary': item.summary,
                        'published': item.published,
                        'last_modified': item.last_modified,
                        'score_v3': item.cvss.get('score_v3', 0),
                        'score_v2': item.cvss.get('score_v2', 0),
                        'vector_string_v2': item.cvss.get('vector_string_v2', 'NA'),
                        'vector_string_v3': item.cvss.get('vector_string_v3', 'NA'),
                        'affected': item.affected,
                        'cache-index': count,
                    }
                })
            count = count + 1

    if USE_ELASTIC_SEARCH is True:
        try:
            if self.es.indices.exists(index="cve-" + year):
                self.es.indices.delete(index='cve-' + year, ignore=[400, 404],
                                       request_timeout=60)
            mappings = {
                "mappings": {
                    "vulns": {
                        "properties": {
                            "cve_id": {"type": "keyword"},
                            "score_v2": {"type": "float"},
                            "score_v3": {"type": "float"},
                            "affected": {
                                "type": "nested",
                                "properties": {
                                    "cpe22": {"type": "keyword"},
                                    "cpe23": {"type": "keyword"},
                                    "vStartE": {"type": "keyword"},
                                    "vStartI": {"type": "keyword"},
                                    "vEndE": {"type": "keyword"},
                                    "vEndI": {"type": "keyword"}
                                }
                            }
                        }
                    }
                }
            }
            self.es.indices.create(index="cve-" + year, ignore=400, body=mappings)
            self.helpers.bulk(self.es, actions, request_timeout=60)
        except Exception as e:
            print("[!] Elasticsearch indexing error: " + str(e))

    try:
        dump(cve_cache, open(path.join(CACHE_PATH, year + '.db'), "wb"), HIGHEST_PROTOCOL)
        remove(file_path + '.zip')
        remove(file_path)
    except PickleError as e:
        print("[!] Error while caching CVE data: " + str(e))
def update_db(self):
    # Use the HTTP proxy configured in the environment when one is set.
    if environ.get('http_proxy') is not None:
        http = ProxyManager(environ.get('http_proxy'), maxsize=10)
    else:
        http = PoolManager()
    disable_warnings(urllib3_exceptions.InsecureRequestWarning)

    r = None
    meta = None
    try:
        r = http.request('GET', FEED_URL + 'official-cpe-dictionary_v2.3.meta',
                         preload_content=False)
    except Exception as e:
        print("[!] Error obtaining CPE dictionary meta data: " + str(e))

    if path.isfile(self.meta_file_path):
        with open(self.meta_file_path, 'r') as myfile:
            meta = myfile.read()

    # If the remote meta file matches the cached one, the local dictionary is already current.
    if r is not None and r.data.decode('utf-8').replace('\r', '') == meta:
        return
    else:
        if r is not None:
            with open(self.meta_file_path, 'wb') as out_file:
                copyfileobj(r, out_file)

    try:
        with http.request('GET', FEED_URL + 'official-cpe-dictionary_v2.3.xml.zip',
                          preload_content=False) as r, open(self.zipfile_location, 'wb') as out_file:
            copyfileobj(r, out_file)
    except Exception as e:
        print("[!] Error downloading CPE dictionary: " + str(e))
        return

    try:
        archive = ZipFile(self.zipfile_location, 'r')
        xml_data = archive.extract('official-cpe-dictionary_v2.3.xml', CACHE_PATH)
    except Exception as e:
        print("[!] Error extracting the CPE archive: " + str(e))
        return

    try:
        root = ET.parse(self.cpe_dictionary_filename).getroot()
    except ET.ParseError as e:
        print("[!] Error while parsing CPE dictionary: " + str(e))
        return

    cpe_dictionary = []
    cpe_names = {}
    actions = []
    count = 0
    for i in root.getchildren()[1:]:
        item = cpe_item()
        item.name = i.attrib['name']
        try:
            if i.attrib['deprecated'] == 'true':
                item.deprecated = True
        except:
            item.deprecated = False
        for j in i.getchildren():
            if 'title' in j.tag:
                item.title = j.text
            elif 'references' in j.tag:
                for k in j.getchildren():
                    item.ref[k.attrib['href']] = k.text
            elif 'cpe23-item' in j.tag:
                item.wfs = j.attrib['name']
        cpe_names[item.name] = item.title
        if USE_ELASTIC_SEARCH:
            actions.append({
                "_index": "cpe-names",
                "_type": "names",
                "_source": {
                    'cpe_id': item.name,
                    'title': item.title,
                    'vendor': item.get_vendor(),
                    'product': item.get_product(),
                    'version': item.get_version(),
                    'wfs': item.wfs,
                    'cache-index': count,
                }
            })
        count = count + 1
        cpe_dictionary.append(item)

    if USE_ELASTIC_SEARCH:
        try:
            if self.es.indices.exists(index="cpe-names"):
                self.es.indices.delete(index='cpe-names', ignore=[400, 404], request_timeout=60)
            mappings = {
                "mappings": {
                    "names": {
                        "properties": {
                            "cpe_id": {"type": "keyword"},
                            "wfs": {"type": "keyword"},
                            "product": {"type": "keyword"},
                            "version": {"type": "keyword"},
                            "vendor": {"type": "keyword"}
                        }
                    }
                }
            }
            self.es.indices.create(index="cpe-names", ignore=400, body=mappings)
            self.helpers.bulk(self.es, actions, request_timeout=60)
        except Exception as e:
            print("[!] Elasticsearch indexing error: " + str(e))

    try:
        dump(cpe_dictionary, open(self.cpe_cache_filename, "wb"), HIGHEST_PROTOCOL)
        dump(cpe_names, open(self.cpe_names_filename, "wb"), HIGHEST_PROTOCOL)
        remove(self.cpe_dictionary_filename)
        remove(self.zipfile_location)
    except PickleError as e:
        print("[!] Error while caching CPE data: " + str(e))
print("Importation OK") index = 0 for username in usernames: index += 1 # On fait la requete req = 'https://www.instagram.com/' + username + '/?__a=1' if len(proxy_list) >= 1: # On choisi le proxy au hasard proxy_number = random.randint(0, len(proxy_list) - 1) http = ProxyManager("http://" + proxy_list[proxy_number] + "/") else: http = PoolManager() webpage = http.request('GET', req) try: test = json.loads(webpage.data) webpage = str(webpage.data) except Exception as e: if len(proxy_list) >= 1: print("Ce proxy est cramé : " + proxy_list[proxy_number]) proxy_list.remove(proxy_list[proxy_number]) if len(proxy_list) < 1: print("Il n'y a plus de proxy disponible") with open(dest_file, "w") as output: writer = csv.writer(output, lineterminator='\n') for val in infos: writer.writerow([val])
def isight_load_data(a_url, a_query, a_headers):
    """
    :param a_url:
    :type a_url:
    :param a_query:
    :type a_query:
    :param a_headers:
    :type a_headers:
    :return:
    :rtype:
    """
    try:
        PySight_settings.logger.debug("param headers: %s %s", a_headers, a_url)
        proxy_request = ProxyManager(str(PySight_settings.proxy_adress))
        url_to_load = PySight_settings.isight_url + a_query
        PySight_settings.logger.debug(url_to_load)
        try:
            r = proxy_request.request('GET', a_url + a_query, None, headers=a_headers)
        except urllib.error.HTTPError as e:
            print(e.code)
            print(e.read())
        PySight_settings.logger.debug("headers %s: ", proxy_request.headers)
        PySight_settings.logger.debug("data %s: ", r.data)

        return_data_cleaned = r.data.replace('\n', '')
        json_return_data_cleaned = json.loads(return_data_cleaned.decode('utf8'))
        PySight_settings.logger.debug(json_return_data_cleaned)
        # print json.dumps(theJson,sort_keys=True,indent = 4, separators = (',', ': '))
        PySight_settings.logger.debug("Number of iocs: %s answer is: %s",
                                      len(json_return_data_cleaned['message']),
                                      json_return_data_cleaned)
        if not json_return_data_cleaned['success']:
            PySight_settings.logger.error("Error with iSight connection %s",
                                          json_return_data_cleaned['message']['description'])
            PySight_settings.logger.error(json_return_data_cleaned)
            return False
        else:
            # Dump the raw answer to a timestamped debug file before returning it.
            import time
            timestring = time.strftime("%Y%m%d-%H%M%S")
            f = open("debug/" + timestring, 'w')
            f.write(json.dumps(json_return_data_cleaned, sort_keys=True, indent=6, separators=(',', ': ')))
            f.close()
            return json_return_data_cleaned
    except:
        print("Unexpected error: %s", sys.exc_info())
        return False
def get_terms(self, term, id, id_string, number_of_results, is_match=False):
    # Make API call to get xml data #
    term = self.lemmatize(self.preprocess(term))

    # Proxy Code and Base Query #
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    second_url = "esummary.fcgi?db=medgen&db=medgen&{query}"
    final_url = base_url + second_url.format(db=self.ontology, query="id=" + id_string)
    http = ProxyManager("http://proxy.gtm.lilly.com:9000/")
    t.sleep(1)
    response = http.request('GET', final_url)
    soup = BeautifulSoup(response.data, 'lxml')

    # Get the separate hits in lists #
    hits = soup.find_all('documentsummary')

    # Dictionary to store the results #
    results = []

    # Set threshold, take the min of the threshold requested and the total number of search results #
    threshold = min(self.threshold, number_of_results)

    # For every hit (each hit represents data from ONE UID) #
    for hit in hits:
        # Keeps track of meeting the threshold #
        counter = 0

        # Check if return is a disease #
        check = "Blank" if not len(hit.find("semanticid")) else hit.find("semanticid").text.strip()

        # List of acceptable semantic types #
        semantic_types = [
            'T191', 'T047', 'T048', 'T019', 'T190', 'T033', 'T049', 'T046',
            'T184', "Blank"
        ]

        # If term is a disease, execute the following: #
        if check in semantic_types:
            # Get Concept ID #
            concept_id = "Blank" if not len(hit.find('conceptid')) else hit.find('conceptid').text.strip()

            # Get Title #
            title = hit.find('title').text.strip()

            # Get name tags for looping #
            name_tags = hit.find_all('name')

            # Get definition/description #
            definition = hit.find('definition').text.strip()
            def_score = self.modified_jaccard_similarity(term, definition)

            # Get SAB, CODE, SCUI, SDUI, and Title #
            processed_term = self.stem(term)
            new_title = self.stem(self.lemmatize(self.preprocess(title)))

            # Keeps track of best scores for each uid #
            scores = []

            # Loop through synonyms #
            for data in name_tags:
                # Get the max syn_score between a synonym and the title when everything is a single word,
                # otherwise fall back to Jaccard similarity #
                new_text = self.stem(self.lemmatize(self.preprocess(data.text)))
                syn_score = max(
                    fuzz.ratio(new_text, processed_term),
                    fuzz.ratio(processed_term, new_title)
                ) if len(new_text.split()) == 1 and len(new_title.split()) == 1 and len(processed_term.split()) == 1 \
                    else self.jaccard_similarity(new_text, processed_term)

                # If score is 100 or the term is one word, take the syn_score #
                score = syn_score if len(term.split()) == 1 or syn_score == 100 else max(syn_score, def_score)

                # Initialize dictionary to add to results #
                value = dict()
                code, sab, scui, sdui = None, None, None, None
                index = hits.index(hit)

                # Add basic metadata to the dictionary #
                value['Disease_Input'] = term
                value['Ontology'] = self.ontology
                value['Synonym'] = data.text
                value['Description'] = definition
                value['Semantic_Type'] = check
                value['UID'] = id[index]
                value['Ontology_ID'] = concept_id
                value['Final_Score'] = syn_score + def_score
                value['Synonym_Score'] = syn_score
                value['Description_Score'] = def_score
                value['Title'] = title
                value['Number_of_Results'] = number_of_results
                value['Holder'] = score

                # Add extra metadata that may throw errors and add to dictionary #
                try:
                    code = data['code']
                    value['CODE'] = code
                except:
                    value['CODE'] = np.nan
                try:
                    sab = data['sab']
                    value['SAB'] = sab
                except:
                    value['SAB'] = np.nan
                try:
                    scui = data['scui']
                    value['SCUI'] = scui
                except:
                    value['SCUI'] = np.nan
                try:
                    sdui = data['sdui']
                    value['SDUI'] = sdui
                except:
                    value['SDUI'] = np.nan
                scores.append(value)

            # This code takes scores (as it has metadata for only ONE uid) and finds the best match #
            # Get the best score, if scores has results (it may be empty) #
            if scores:
                # Gets the dictionary with the highest score and its corresponding data #
                best_score_data = max(scores, key=lambda x: x['Final_Score'])
                best_score = best_score_data['Holder']
                results.append(best_score_data)

                # If best score is greater than or equal to the threshold, increase counter (a step closer to threshold) #
                if best_score >= self.score_threshold or threshold == 1:
                    counter += 1

                # If threshold is met, then return results #
                if counter == threshold:
                    return results

    return results
class MainWindow(QWebView):
    def __init__(self):
        super().__init__()
        self.f = open('proxi.txt', 'r')
        self.initUI()

    def initUI(self):
        # self.showFullScreen()
        self.i = 0
        self.prox = 0
        self.a = 0
        self.setWindowTitle('Safari')
        # QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "173.192.21.89", 25))
        # QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "88.159.123.151", 80))
        # QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "138.197.137.90", 3128))
        # QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "94.177.175.232", 3128))
        # self.load(QUrl('https://www.youtube.com/watch?v=-oV8nNeLpjY&list=PLmahvFaEUuOmzu6t8rZ4SUz3ECAC2jQf5'))
        # self.load(QUrl('https://youtu.be/-KqdN1uScHc'))
        # self.load(QUrl('https://vk.com/video447929742_456239017'))
        self.load(QUrl('https://www.youtube.com'))
        self.show()
        self.loadFinished.connect(self.prow)

    def prow(self):
        if self.a == 0:
            self.a = 1
            self.fin()

    def fin(self):
        print('Entering the program loop')
        self.i = 0
        # Idle for about 50 seconds while keeping the Qt event loop responsive.
        while self.i < 500:
            time.sleep(0.1)
            QtWidgets.qApp.processEvents()
            self.i = self.i + 1
        # Rotate through a hard-coded list of HTTP proxies.
        if self.prox == 0:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "138.197.137.90", 3128))
        elif self.prox == 1:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "35.185.80.76", 3128))
        elif self.prox == 2:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "212.237.15.178", 80))
        elif self.prox == 3:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "104.236.27.71", 80))
        elif self.prox == 4:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "144.217.100.67", 80))
        elif self.prox == 5:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "94.177.175.232", 80))
        elif self.prox == 6:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "149.56.42.236", 80))
        elif self.prox == 7:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "35.185.80.76", 3128))
        elif self.prox == 8:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "162.243.140.150", 8000))
        elif self.prox == 9:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "177.67.84.157", 8080))
        elif self.prox == 10:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "192.208.184.134", 8080))
        elif self.prox == 11:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "151.80.152.121", 8080))
        elif self.prox == 12:
            QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, "212.237.15.178", 8080))
        else:
            self.prox = -1
        print('Proxy used -', self.prox)
        print('Loading the site')
        self.a = 0
        self.load(QUrl('https://youtu.be/-KqdN1uScHc'))
        self.prox = self.prox + 1
        print('Supposedly loaded')

    # def mouseReleaseEvent(self, e):
    #     print("xlixk")

    def finished(self):
        self.fin = self.fin + 1
        print("Load Finish")
        self.loading()

    def loading(self):
        QtWidgets.qApp.processEvents()
        if self.fin > 0:
            self.fin = 0
            self.newsite()

    def newsite(self):
        # Keep trying proxies from the file until one answers a test request.
        while not self.request():
            QtWidgets.qApp.processEvents()
            print('Proxy bad')
        QtWidgets.qApp.processEvents()
        QNetworkProxy.setApplicationProxy(QNetworkProxy(QNetworkProxy.HttpProxy, self.proxi_ip, self.proxi_port))
        self.load(QUrl('https://vk.com/id447929742?z=video447929742_456239017%2F2819f4855e1e422801%2Fpl_wall_447929742'))
        return True

    def request(self):
        QtWidgets.qApp.processEvents()
        self.proxi()
        print(self.stroka2)
        self.prm = ProxyManager(str(self.stroka2))
        print(self.stroka2)
        try:
            QtWidgets.qApp.processEvents()
            r = self.prm.request('GET', 'https://www.yandex.ru/')
        except:
            return False
        return True

    def proxi(self):
        # Read the next "ip:port" line from proxi.txt and split it into address and port.
        self.stroka = self.f.readline()
        self.stroka2 = 'http://' + self.stroka
        self.stroka2 = self.stroka2[:len(self.stroka2) - 1]
        print(self.stroka)
        QtWidgets.qApp.processEvents()
        self.massiv = self.stroka.split(':')
        self.proxi_ip = str(self.massiv[0])
        self.proxi_port = int((self.massiv[1])[:len(self.massiv[1]) - 1])
        QtWidgets.qApp.processEvents()
        print(self.proxi_ip)
        print(self.proxi_port)