def _init(self, path_to_tx=None):
    instructions = "Run 'tx init' to initialize your project first!"
    try:
        self.root = self._get_tx_dir_path(path_to_tx)
        self.config_file = self._get_config_file_path(self.root)
        self.config = self._read_config_file(self.config_file)
        local_txrc_file = self._get_transifex_file(os.getcwd())
        if os.path.exists(local_txrc_file):
            self.txrc_file = local_txrc_file
        else:
            self.txrc_file = self._get_transifex_file()
        self.txrc = self._get_transifex_config([self.txrc_file, ])
    except ProjectNotInit as e:
        logger.error('\n'.join([six.u(str(e)), instructions]))
        raise
    host = self.config.get('main', 'host')
    if host.lower().startswith('https://'):
        self.conn = urllib3.connection_from_url(
            host,
            cert_reqs=ssl.CERT_REQUIRED,
            ca_certs=certs_file()
        )
    else:
        self.conn = urllib3.connection_from_url(host)

def make_request(method, host, url, username, password, fields=None):
    if host.lower().startswith('https://'):
        connection = urllib3.connection_from_url(
            host,
            cert_reqs=ssl.CERT_REQUIRED,
            ca_certs=certs_file()
        )
    else:
        connection = urllib3.connection_from_url(host)
    headers = urllib3.util.make_headers(
        basic_auth='{0}:{1}'.format(username, password),
        accept_encoding=True,
        user_agent=user_agent_identifier(),
        keep_alive=True
    )
    r = None
    try:
        r = connection.request(method, url, headers=headers, fields=fields)
        data = r.data
        charset = determine_charset(r)
        if isinstance(data, bytes):
            data = data.decode(charset)
        if r.status < 200 or r.status >= 400:
            if r.status == 404:
                raise HttpNotFound(data)
            else:
                raise Exception(data)
        return data, charset
    except SSLError:
        logger.error("Invalid SSL certificate")
        raise
    finally:
        if r is not None:
            r.close()

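# A minimal usage sketch for make_request above. The host, path, and credentials
# are illustrative assumptions, not values taken from the original project.
def example_make_request_call():
    try:
        data, charset = make_request(
            'GET', 'https://www.transifex.com', '/api/2/projects/',
            'example-user', 'example-password'
        )
        logger.info("Got %d chars (%s)", len(data), charset)
    except HttpNotFound:
        logger.warning("Resource not found")
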
def _init(self, path_to_tx=None):
    instructions = "Run 'tx init' to initialize your project first!"
    try:
        self.root = self._get_tx_dir_path(path_to_tx)
        self.config_file = self._get_config_file_path(self.root)
        self.config = self._read_config_file(self.config_file)
        local_txrc_file = self._get_transifex_file(os.getcwd())
        if os.path.exists(local_txrc_file):
            self.txrc_file = local_txrc_file
        else:
            self.txrc_file = self._get_transifex_file()
        self.txrc = self._get_transifex_config([self.txrc_file, ])
    except ProjectNotInit as e:
        logger.error('\n'.join([six.u(str(e)), instructions]))
        raise
    host = self.config.get('main', 'host')
    if host.lower().startswith('https://'):
        self.conn = urllib3.connection_from_url(host,
                                                cert_reqs=CERT_REQUIRED,
                                                ca_certs=web.certs_file())
    else:
        self.conn = urllib3.connection_from_url(host)

def parse_feed_parallel(num, feed_options_item, all_links, queue, t_limit=None):
    """
    Parallel creation of a RSSItem for each post in the feed.
    :param num: The feed's number in the list. For DEBUG purposes
    :param feed_options_item: The RSS Feed options
    :param all_links: A set of all the links in the database
    :param queue: A Queue to store the resulting RSSPost objects
    :param t_limit: An integer used to limit the number of running threads
    """
    t1 = millis()
    # Read the feed XML and store it as a string
    try:
        a = urllib.urlopen(feed_options_item.feed_url).read()
    except IOError as e:
        logger.error("Getting XML for feed %s failed. No posts from this feed will be processed"
                     % feed_options_item.feed_url)
        return
    d = speedparser.parse(a, clean_html=False)  # SpeedParser is ~10 times faster than FeedParser
    t2 = millis()
    logger.debug("%d %s with %d posts, SpeedParser done in: %d ms"
                 % (num, feed_options_item.feed_url, len(d.entries), (t2 - t1)))

    # Create a thread for each entry in the feed which is not present in the database
    threads = []
    http = None
    if 'feedburner' in feed_options_item.feed_url:
        # Get the host of the first original link
        http = urllib3.connection_from_url(d.entries[0].get("id", d.entries[0].link),
                                           maxsize=40, block=True)
    else:
        # Got maxsize=40 experimentally as best value
        http = urllib3.connection_from_url(feed_options_item.feed_url, maxsize=40, block=True)

    # Fill threads list
    for entry in d.entries:
        if 'feedproxy.google' in entry.link:
            # FeedProxy workaround
            if entry.get("id", entry.link) not in all_links:
                threads.append(threading.Thread(target=get_html3,
                                                args=(http, entry, feed_options_item, queue)))
        else:
            if entry.link not in all_links:
                threads.append(threading.Thread(target=get_html3,
                                                args=(http, entry, feed_options_item, queue)))

    # Run threads depending on thread limit
    if t_limit:
        for i in range(0, len(threads), t_limit):
            for j in range(min(t_limit, len(threads) - i)):
                threads[i + j].start()
            for j in range(min(t_limit, len(threads) - i)):
                threads[i + j].join()
    # If t_limit is None, run all threads at once
    else:
        for t in threads:
            t.start()
        for t in threads:
            t.join()

def check_internet():
    """Check Internet Connectivity"""
    try:
        urllib3.connection_from_url('https://draugeros.org', timeout=1)
        return True
    except:
        return False

def __init__(self, api_host, api_mobile_host, customer_id, secret_key, ssl=True):
    self._customer_id = customer_id
    self._secret_key = secret_key
    self._api_host = api_host
    self._api_mobile_host = api_mobile_host
    http_root = "https://" if ssl else "http://"
    self._pool = urllib3.connection_from_url(http_root + api_host)
    self._pool_mobile = urllib3.connection_from_url(http_root + api_mobile_host)

def create_http_pool(settings):
    url = settings['notification_url']
    maxsize = settings['threaded.threads']  # sort of a lie, potentially
    timeout = settings['timeout']
    if settings['use_ssl']:
        ca_certs = settings['ca_certs']
        return urllib3.connection_from_url(url, maxsize=maxsize,
                                           timeout=timeout,
                                           cert_reqs='CERT_REQUIRED',
                                           ca_certs=ca_certs)
    return urllib3.connection_from_url(url, maxsize=maxsize, timeout=timeout)

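# Hypothetical settings mapping for create_http_pool above; the keys mirror the
# lookups inside the function, the values are placeholder assumptions.
example_settings = {
    'notification_url': 'https://notify.example.com',
    'threaded.threads': 4,
    'timeout': 5,
    'use_ssl': True,
    'ca_certs': '/etc/ssl/certs/ca-certificates.crt',
}
# pool = create_http_pool(example_settings)
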
def test_same_url(self):
    # Convince ourselves that normally we don't get the same object
    conn1 = connection_from_url('http://localhost:8081/foo')
    conn2 = connection_from_url('http://localhost:8081/bar')
    self.assertNotEqual(conn1, conn2)

    # Now try again using the PoolManager
    p = PoolManager(1)
    conn1 = p.connection_from_url('http://localhost:8081/foo')
    conn2 = p.connection_from_url('http://localhost:8081/bar')
    self.assertEqual(conn1, conn2)

def nl_fetch_url(service_vars=None, method='POST', service=nl_service):
    if service == nl_service:
        http = urllib3.connection_from_url('https://www.nganluong.vn/')
    else:
        http = urllib3.connection_from_url('http://exu.vn/')
    fetch_data = http.request(method, service, service_vars)
    try:
        if fetch_data.status == 200:
            return fetch_data.data
        else:
            raise Exception(fetch_data.status)
    except Exception as ex:
        print "Error: " + str(ex)
        return None

def get_con_pool(host,
                 key_file=None,
                 cert_file=None,
                 socket_timeout=15.0,
                 max_pool_size=3,
                 verify_https=True):
    """
    Return a ConnectionPool instance of given host
    :param socket_timeout: socket timeout for each connection in seconds
    """
    kwargs = {
        "timeout": socket_timeout,
        "maxsize": max_pool_size,
        "block": True,
    }
    if key_file is not None and cert_file is not None:
        kwargs["key_file"] = key_file
        kwargs["cert_file"] = cert_file
    if urisplit(host).scheme == "https":
        kwargs["ssl_version"] = ssl.PROTOCOL_TLSv1
        if verify_https:
            kwargs["cert_reqs"] = "CERT_REQUIRED"
            kwargs["ca_certs"] = getattr(settings, "RESTCLIENTS_CA_BUNDLE",
                                         "/etc/ssl/certs/ca-bundle.crt")
    return connection_from_url(host, **kwargs)

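# Hypothetical call to get_con_pool above; the host and certificate paths are
# assumptions for illustration only.
pool = get_con_pool(
    "https://api.example.edu",
    key_file="/path/to/client.key",
    cert_file="/path/to/client.crt",
    socket_timeout=10.0,
)
response = pool.request("GET", "/status")
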
def url_get(url):
    url_part = parse_url(url)
    UrlObj = Url(url_part.scheme, url_part.auth, url_part.host, url_part.port,
                 url_part.path or None, url_part.query or None,
                 url_part.fragment or None)
    conn = urllib3.connection_from_url(url)
    response = conn.request('GET', UrlObj.request_uri)
    return response

def get_html(self, ticker):
    # timeout = Timeout(connect=4.0, read=10.0)
    http = urllib3.PoolManager()
    # using candle sticks
    url = f'''https://bigcharts.marketwatch.com/advchart/frames/frames.asp?show=&insttype=&symb={ticker}&x=39&y=12&time=7&startdate=1%2F4%2F1999&enddate=2%2F1%2F2021&freq=1&compidx=aaaaa%3A0&comptemptext=&comp=none&ma=5&maval=20+40&uf=0&lf=4&lf2=32&lf3=1&type=4&style=320&size=2&timeFrameToggle=false&compareToToggle=false&indicatorsToggle=false&chartStyleToggle=false&state=11'''
    http_pool = ""
    web_page = ""
    html = "null"
    status_code = 400
    try:
        http_pool = urllib3.connection_from_url(url)
        web_page = http_pool.urlopen('GET', url)
        status_code = web_page.status
    except Exception as e:
        print(f'\n[-] ERROR: URL: {e}\n')
    print(f'[@] status-code: {status_code}')
    if status_code == 200:
        html = web_page.data.decode('utf-8')
    return html

def download_category(self):
    url = 'http://shop.hansalim.or.kr/im/main.do'
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET', url)
    page = r.data.decode('utf-8')
    # parse the html using beautiful soup and store in variable `soup`
    soup = BeautifulSoup(page, 'html.parser')
    # Take out the <div> of name and get its value
    img_all = soup.find_all('img', class_='second_category_menu_btn_img')
    second_category_list = [(img.get('id'), img.get('alt')) for img in img_all]
    for second_category in second_category_list:
        li_all = soup.find_all('li', class_='third_category_menu_btn',
                               id=re.compile(second_category[0].replace('second', 'third').replace('img', 'menu')))
        # third_category_list = [(a.get('href')[-6:], a.string) for li in li_all
        #                        for a in li if a.string != '\n']
        third_category_list = [a.get('href')[-6:] for li in li_all
                               for a in li if a.string != '\n']
        # self.categories[(third_category_list[0][0][0:4], second_category[1])] = third_category_list
        self.categories[third_category_list[0][0:4]] = third_category_list
    print(self.categories)
    # sys.setrecursionlimit(50000)
    with open(os.path.join(self.root_dir, 'category.pickle'), 'wb') as out_file:
        # Pickle the 'data' dictionary using the highest protocol available.
        # Saving as an object raised RecursionError, so store object -> str
        pickle.dump(self.categories, out_file, -1)  # pickle.HIGHEST_PROTOCOL
    print('download complete..')

def find_lurersport(self):
    url = 'https://lurer.com'
    site_url = 'https://lurer.com/?cat=16&l=am'
    data_list = []
    http_pool = urllib3.connection_from_url(site_url)
    request = http_pool.urlopen('GET', site_url)
    html = request.data.decode('utf-8')
    soup = bs4(html, 'html.parser')
    arg_main = soup.find_all("div", {'class': 'mainSilver clearfix'})
    arg_second = bs4(str(arg_main), 'html.parser')
    arg_last = arg_second.find_all('div', {'class': 'mainCenterWrapperLeft clearfix'})
    arg = bs4(str(arg_last), 'html.parser')
    arg_back = arg.find_all('div', {'class': 'catBox clearfix'})
    arg_second = bs4(str(arg_back), 'html.parser')
    arg = arg_second.find_all('a', href=True)
    for ar in arg:
        if ar.text:
            txt = ar.text[22:]
            txt = txt.strip()
            link = url + str(ar['href'])
            if txt != '':
                data_list.append([link, txt])
    return data_list

def getcontent():
    pages.drop()
    for link in links.find():
        url = link['url']
        id = link['_id']
        try:
            http_pool = urllib3.connection_from_url(url)
            r = http_pool.urlopen('GET', url)
            html = r.data.decode('utf-8')
            # print(html)
            soup = BeautifulSoup(html, "html5lib")
            head = soup.find('head')
            body = soup.find('body')
            json_html = {
                "url_link": id,
                "url": url,
                "html": html,
                "head": head.encode(),
                "body": body.encode()
            }
            pages.insert_one(json_html)
        except:
            print("Unexpected error:", sys.exc_info()[0])
    # return number of pages retrieved
    return pages.count()

def __init__(self, url, name, user=None, passwd=None):
    Thread.__init__(self)
    self.session = load_session()
    self.should_stop = Event()
    self.url = url
    self.path = urlsplit(url).path
    headers = {}
    if user is not None:
        if passwd is None:
            passwd = ""
        headers['Authorization'] = "Basic " + base64.encodestring("%s:%s" % (user, passwd))[:-1]
    self.http_pool = urllib3.connection_from_url(self.url, headers=headers)
    self.composite = None
    self.composite_bg = None
    self.composite_start = time()
    s = self.session
    q = s.query(SensorGroup).filter_by(name=name)
    if q.count() != 1:
        raise SensorNotFoundError("Invalid sensor group name")
    self.group = q[0]
    q = s.query(Sensor).filter_by(group=self.group)
    if q.count() < 1:
        raise SensorNotFoundError("Sensor group contains no image sensors")
    self.sensor = q[0]

def create_code(submission):
    url = "http://codeforces.com/contest/" + str(submission['contestId']) + \
          "/submission/" + str(submission['id'])
    if flags['proxy'] == 1:
        http = urllib3.proxy_from_url(proxyDict['http'])
    else:
        http = urllib3.connection_from_url(url)
    handle = http.request('GET', url)
    html_gunk = handle.data
    # print(html_gunk)
    soup = BeautifulSoup(html_gunk, 'html.parser')
    # subprocess.call(["touch", './source-code/' + str(submission['id']) + ".cpp"])
    fi3 = open('./source-code/' + str(submission['id']) + ".cpp", "w")
    fi3.write("//Language: " + str(submission['programmingLanguage']) + "\n\n\n")
    try:
        result = soup.pre.get_text().encode('utf-8', errors='replace').decode('utf-8')
    except AttributeError:
        result = bs4_error_text
    except UnicodeDecodeError:
        result = '<CHAR>'
    except UnicodeEncodeError:
        result = '<CHAR>'
    fi3.write(result)
    fi3.close()

def __init__(self, APIKey, Secret, tovalue):
    self.APIKey = str(APIKey)
    self.Secret = str(Secret)
    self.toValue = tovalue
    self.nonce = int(time.time())
    self.http_pool = connection_from_url('https://btc-e.com')
    self.table = ConversionTable(self.getMarketsGraph())

def is_connected():
    try:
        conn = urllib3.connection_from_url('http://irinn.ir/', timeout=1)
        conn.request('GET', '/')
        return True
    except:
        return False

def getcontent():
    pages.drop()
    for link in links.find():
        url = link['url']
        id = link['_id']
        try:
            http_pool = urllib3.connection_from_url(url)
            r = http_pool.urlopen('GET', url)
            html = r.data.decode('utf-8')
            # print(html)
            soup = BeautifulSoup(html, "html5lib")
            head = soup.find('head')
            body = soup.find('body')
            json_html = {
                "url_link": id,
                "url": url,
                "html": html,
                "head": head.encode(),
                "body": body.encode()
            }
            pages.insert_one(json_html)
        except:
            print("Unexpected error:", sys.exc_info()[0])
    # return number of pages retrieved
    return pages.count()

def _connection_from_urlinfo(urlinfo):
    key = '%s://%s:%s/' % (urlinfo.scheme, urlinfo.hostname, urlinfo.port)
    pool = pool_cache.get(key, None)
    if not pool:
        pool = urllib3.connection_from_url(urlinfo.geturl())
        pool_cache[key] = pool
    return pool

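# Sketch of how _connection_from_urlinfo above might be used; it assumes a
# module-level pool_cache dict (defined here for the sketch) and takes a parsed
# URL, e.g. from urllib.parse.urlparse.
from urllib.parse import urlparse

pool_cache = {}
pool = _connection_from_urlinfo(urlparse('http://example.com/path'))
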
def get_new_hbr():
    hbr_url_init = "https://hbr.org/"
    url = "https://hbr.org/topic/data"
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET', url)
    soup = BeautifulSoup(r.data.decode('utf-8'), 'html.parser')
    items = soup.findAll('div', attrs={'class': 'row'})
    df = pd.DataFrame(columns=["date", "type", "title", "url", "abstract"])
    for i in progressbar.progressbar(items):
        try:
            title = i.find('h3', attrs={'class': 'hed'})
            url = hbr_url_init + title.find('a').get("href")
            title_text = title.text.replace("\t", "").replace("\n", "")
            type = i.find('span', attrs={'class': 'content-type'})
            type_text = type.text.replace("\t", "").replace("\n", "")
            date_ul = i.find('ul', attrs={'class': 'stream-utility plain-inline-list'})
            date_li = date_ul.find('li', attrs={'class': 'utility pubdate'})
            date_text = date_li.text.replace("\t", "").replace("\n", "")
            paragraph = getParagraph(url)
            abstract = textRankAlgorithm(paragraph)
            df = df.append(
                {
                    'date': date_text,
                    'type': type_text,
                    'title': title_text,
                    'url': url,
                    'abstract': abstract
                },
                ignore_index=True)
        except:
            pass
    return df

def download_from_file(link_list_file, output_dir):
    conn = None
    with open(link_list_file, 'r') as links_file:
        for link_line in links_file.readlines():
            link = link_line.strip()
            filename = os.path.join(output_dir, link.split('/')[-1])
            if not filename.endswith('.zip'):
                filename += '.zip'
            if not os.path.isfile(filename):
                if conn is None:
                    conn = urllib3.connection_from_url(link)
                path = urlparse.urlparse(link).path
                response = conn.request('GET', path,
                                        headers={'Accept': '*/*',
                                                 'Accept-Encoding': 'gzip, deflate',
                                                 'User-Agent': 'crate-weather/0.0.1'},
                                        timeout=10)
                if response.status >= 400:
                    logger.error("error accessing {0}: {1}".format(link, response.data))
                else:
                    with open(filename, 'wb') as output:
                        for data in response.stream():
                            output.write(data)
                    response.release_conn()
                    logger.info("downloaded {0}".format(os.path.basename(filename)))
            else:
                logger.info("{0} already downloaded".format(os.path.basename(filename)))

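# Hypothetical invocation of download_from_file above: a plain-text file with one
# archive URL per line plus an output directory; both paths are assumptions.
download_from_file('links.txt', '/tmp/weather-archives')
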
def __init__(self, ServerHost=False, User=False, Password=False):
    self.Config = PyJedoxWebConfig()
    self.useProxy = USE_PROXY
    self.ProxyUrl = self.Config.getProxyUrl()
    self.ServerHost = ServerHost if ServerHost else self.Config.getJedoxHost()
    self.ServerPort = self.Config.getJedoxPort()
    self.ServerRoot = "http://%s:%s/" % (self.ServerHost, self.ServerPort)
    # deprecated: urllib.FancyURLopener({'http': self.ProxyUrl}) if self.useProxy else
    #
    # To verify if SSL is enabled, try:
    # >>> import socket
    # >>> socket.ssl
    # <function ssl at 0x4038b0>
    #
    # try:
    #     import socket
    #     socket.ssl
    # except ImportError:
    #     print("error: no ssl support")
    #
    self.Client = urllib3.connection_from_url(self.ServerRoot)
    # get_url -> request(method, url, fields=None, headers=None, **urlopen_kw)
    self.getUrlResult = self.Client.request('GET', '/')
    self.User = self.Config.getJedoxUser()
    self.Password = self.Config.getJedoxPassword()
    self.UrlEncoder = urllib.parse.urlencode
    self.__DBList = {}

def get_new_mckinsey():
    mckinsey_url_init = "https://www.mckinsey.com"
    mckinsey_types = ["Article", "Interview", "Commentary", "DiscussionPaper"]
    url = "https://www.mckinsey.com/business-functions/mckinsey-analytics/our-insights"
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET', url)
    soup = BeautifulSoup(r.data.decode('utf-8'), 'html.parser')
    items = soup.findAll('div', attrs={'class': "item"})
    df = pd.DataFrame(columns=["date", "type", "title", "url", "abstract"])
    for i in progressbar.progressbar(items):
        new = i.find('span', attrs={'class': 'eyebrow'})
        new_type = new.text.replace("\t", "").replace("\n", "").replace(" ", "").split("-")[0]
        if new_type in mckinsey_types:
            text_wrapper = i.find('div', attrs={'class': 'text-wrapper'})
            date = text_wrapper.find('div', attrs={'class': 'description'})
            date_text = date.find('time').text.replace("\t", "").replace("\n", "")
            title = text_wrapper.find('a')
            title_text = title.text.replace("\t", "").replace("\n", "")
            url = mckinsey_url_init + title.get('href')
            paragraph = getParagraph(url)
            abstract = textRankAlgorithm(paragraph)
            df = df.append(
                {
                    'date': date_text,
                    'type': new_type,
                    'title': title_text,
                    'url': url,
                    'abstract': abstract
                },
                ignore_index=True)
    return df

def get():
    file = open("/home/pi/nas/Content/Init/JC_Website_Reference.txt", "r")
    websites = file.readlines()
    file.close()
    url = random.choice(websites).rstrip()
    print(url)
    http_pool = urllib3.connection_from_url(url)
    html = http_pool.urlopen('GET', url).data
    soup = BeautifulSoup(html, features="html.parser")
    rows = soup.findAll('p')
    output = []
    for d in rows[1:]:
        text = d.text.split("</span>")[0]
        if text not in ["", " ", "\n"]:
            prox = text.split(" ")
            count = 0
            lineCount = 0
            line = ""
            for word in prox:
                if count < 55:
                    count += len(word) + 1
                    line += word + " "
                else:
                    lineCount += 1
                    output.append(line)
                    count = 0
                    line = ""
            if line != "":
                output.append(line)
    heading = output[0]
    return output

def upload():
    upload_url = "http://127.0.0.1:8080/upload"
    url = urllib3.util.parse_url(upload_url)
    cb_url = url.request_uri
    if url.port is not None:
        server = "%s:%d" % (url.host, url.port)
    else:
        server = url.host
    conn = urllib3.connection_from_url(server)
    headers = urllib3.make_headers(keep_alive=True)
    content = "hello world"
    response = conn.urlopen("POST", cb_url, body=content, headers=headers)
    if response.status != 200:
        print "eeeeeeeeeeee"
        sys.exit(1)
    else:
        print response.getheaders()
        print response.read()
        print response.data
    fileid = json.loads(response.data)["fileid"]
    path = "/download?fileid=%d" % fileid
    print "download path:", path
    response = conn.urlopen("GET", path, headers=headers)
    if response.status != 200:
        print "download fail"
        sys.exit(1)
    else:
        print response.data

def getConceptDerivedTerms(word):
    searchTerm = word
    link = 'http://conceptnet.io/c/en/'
    link = link + searchTerm
    http_pool = url.connection_from_url(link)
    r = http_pool.urlopen('GET', link)
    http_pool.close()
    html = r.data.decode('utf-8')
    soup = b(html, features="html5lib")
    divs = soup.findAll("a")
    div = []
    candies = []
    for d in divs:
        if d.contents[0] == 'Derived terms':
            div = d.find_parent().find_parent()
    if len(div) > 0:
        links = div.findAll("a")
        for k in links:
            candies.append(n.word_tokenize(k.contents[0]))
        del (candies[0])
    c = []
    for k in candies:
        if len(k) > 1:
            counter = 0
            s = ''
            for j in k:
                if len(j) > 2:
                    counter += 1
                    s = s + ' ' + j
            if counter == len(k):
                c.append(s)
        elif len(k[0]) > 2:
            c.append(k[0])
    candies = c
    c = []
    for k in candies:
        if not k == searchTerm:
            c.append(k)
    candies = c
    for k in range(len(candies)):
        temp = n.word_tokenize(candies[k])
        if len(temp) > 1:
            s = ''
            for j in temp:
                s = s + j + ' '
            candies[k] = s
        else:
            candies[k] = temp[0]
    return candies

def __init__(self, uri, poolmanager=None, username='******', password='******', **kwargs):
    """Constructor for the AICLib object.

    Arguments:
    uri -- the address of the nvp controller including scheme (required)

    Keyword arguments:
    poolmanager -- a pool manager provided by urllib3 (default None)
    username -- the username to log into the nvp controller
    password -- the password to log into the nvp controller
    """
    if poolmanager is None:
        self.conn = urllib3.connection_from_url(uri)
    else:
        self.conn = poolmanager.connection_from_url(uri)
    self.connection = Connection(connection=self.conn,
                                 username=username,
                                 password=password,
                                 **kwargs)

def __init__(self, ServerHost=False, User=False, Password=False):
    self.Config = PyJedoxWebConfig()
    self.useProxy = USE_PROXY
    self.ProxyUrl = self.Config.getProxyUrl()
    self.ServerHost = ServerHost if ServerHost else self.Config.getJedoxHost()
    self.ServerPort = self.Config.getJedoxPort()
    self.ServerRoot = "http://%s:%s/" % (self.ServerHost, self.ServerPort)
    # deprecated: urllib.FancyURLopener({'http': self.ProxyUrl}) if self.useProxy else
    #
    # To verify if SSL is enabled, try:
    # >>> import socket
    # >>> socket.ssl
    # <function ssl at 0x4038b0>
    #
    # try:
    #     import socket
    #     socket.ssl
    # except ImportError:
    #     print "error: no ssl support"
    #
    # self.Client = httplib.HTTPConnection(self.ServerRoot)
    self.Client = urllib3.connection_from_url(self.ServerRoot)
    # httplib
    # self.Client.request("GET", "/")
    # r1 = self.Client.getresponse()
    self.getUrlResult = self.Client.get_url
    self.User = self.Config.getJedoxUser()
    self.Password = self.Config.getJedoxPassword()
    self.UrlEncoder = urllib.urlencode
    self.__DBList = {}

def openUrl(self, url):
    conn = urllib3.connection_from_url(url)
    if self.USER is not None:
        authHeaders = urllib3.util.make_headers(basic_auth="%s:%s" % (self.USER, self.PASSWORD))
        return conn.request("GET", url, headers=authHeaders).data
    else:
        return conn.request("GET", url).data

def openUrl(self, url):
    conn = urllib3.connection_from_url(url)
    if self.USER is not None:
        base64string = base64.encodestring('{0}:{1}'.format(self.USER, self.PASSWORD)).replace('\n', '')
        return conn.get_url(url, headers={"Authorization": "Basic {0}".format(base64string)}).data
    else:
        return conn.get_url(url).data

def read_exif(files, source_folder):
    with exiftool.ExifTool() as et:
        # j = et.get_metadata('/'.join([source_folder, files[0]]))
        f = ['/'.join([source_folder, j]) for j in files]
        metadata = et.get_metadata_batch(f)
    for d in metadata:
        print(d["SourceFile"])
        # [(k, d[k]) for k in d.keys() if str(d[k]).lower().find('g') >= 0]
        conn = urllib3.connection_from_url(progress_url, maxsize=1,
                                           headers={'Content-Type': 'application/json'})
        myjson = {'folder': source_folder,
                  'file': d.get("SourceFile", None),
                  'focal_length': d.get(u'EXIF:FocalLengthIn35mmFormat', None),
                  'apeture': d.get(u'EXIF:FNumber', None),
                  'ISO': d.get(u"EXIF:ISO", None),
                  'shutter': d.get(u'EXIF:ExposureTime', None),
                  # 'raw_json': json.dumps(d)
                  }
        # update status
        conn.urlopen('POST', '/update', body=json.dumps(myjson),
                     headers={'connection': 'keep-alive',
                              'Content-Type': 'application/json'})

def getHTML():
    """
    For each of the urls in query_duplications, get the HTML and store it
    in the query_articles collection
    """
    db = sets.getDB()
    for item in db.query_duplications.find():
        url = item['url']
        # if the article doesn't already exist, get the html
        if db.query_articles.findOne({"url": url}).count() == 0:
            try:
                # Get raw html
                http_pool = urllib3.connection_from_url(url)
                r = http_pool.urlopen('GET', url)
                raw_html = r.data.decode('utf-8')
                # Get soup
                soup = BeautifulSoup(raw_html, 'html5lib')
                head = soup.find('head')
                body = soup.find('body')
                db.query_articles.insert({
                    "url": url,
                    "raw_html": raw_html,
                    "head": head,
                    "body": body
                })
            except:
                print("ERROR: ", sys.exc_info()[0])
    print("Finished retrieving HTML")

def getStats(Id):
    url = "https://www.strava.com/athletes/" + str(Id)
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET', url)
    soup = BeautifulSoup(r.data.decode('utf-8'), 'html.parser')
    table = soup.table
    t = []
    for child in table.children:
        t.append(child)
    tbody = t[3]
    tr = []
    for child in tbody.children:
        if child != '\n':
            tr.append(child)
    final = []
    for r in tr:
        for string in r.strings:
            if string != '\n':
                final.append(repr(string))
    print(final)
    Distance = final[1][1:-1] + ' mi '
    Time = final[4][1:-1] + ':' + final[6][2:-1] + ' '
    Elevation = final[9][1:-1] + ' ft '
    Runs = final[12][1:-1]
    print(Distance + Time + Elevation + Runs)
    return [
        final[1][1:-1], final[4][1:-1], final[6][2:-1], final[9][1:-1],
        final[12][1:-1]
    ]

def __init__(self, url, name, user=None, passwd=None):
    Thread.__init__(self)
    self.session = load_session()
    self.should_stop = Event()
    self.url = url
    self.path = urlsplit(url).path
    headers = {}
    if user is not None:
        if passwd is None:
            passwd = ""
        headers['Authorization'] = "Basic " + base64.encodestring(
            "%s:%s" % (user, passwd))[:-1]
    self.http_pool = urllib3.connection_from_url(self.url, headers=headers)
    self.composite = None
    self.composite_bg = None
    self.composite_start = time()
    s = self.session
    q = s.query(SensorGroup).filter_by(name=name)
    if q.count() != 1:
        raise SensorNotFoundError("Invalid sensor group name")
    self.group = q[0]
    q = s.query(Sensor).filter_by(group=self.group)
    if q.count() < 1:
        raise SensorNotFoundError("Sensor group contains no image sensors")
    self.sensor = q[0]

def getStatic(host, path=''):
    not_visited = ['/']
    links = ['/']
    http = urllib3.connection_from_url(host)
    while not_visited:
        url = not_visited.pop()
        if url.split('.')[-1] not in non_html_suffix:
            request = http.request('GET', url, headers=headers)
            response = request.data.decode('unicode_escape')
            if url == '/':
                url = 'index'
            if url[0] == '/':  # remove slash on both sides
                url = url[1:]
            if url[-1] == '/':
                url = url[:-1]
            if '/' in url:
                dir = path + '/'.join(url.split('/')[:-1])
                if not os.path.exists(dir):
                    os.makedirs(dir)
            target_url = path + url + '.html'
            # '?' is invalid in file names; "?" may appear in a url
            target_url = re.sub(r'\?', '-', target_url)
            with open(target_url, "w", encoding='utf-8') as file:
                file.write(localize(response, url))
            print(url + '.html finished')
            pattern = re.compile(r"href=\"(.*?)\"")
            for candidate_url in pattern.findall(response):
                candidate_url = re.sub(r'#.*', '', candidate_url)
                if candidate_url and 'http' not in candidate_url and 'https' not in candidate_url and candidate_url not in links:
                    not_visited.append(candidate_url)
                    links.append(candidate_url)

def find_tert(self):
    url = 'https://www.tert.am'
    site_url = 'https://www.tert.am/am/'
    data_list = []
    http_pool = urllib3.connection_from_url(site_url)
    request = http_pool.urlopen('GET', site_url)
    html = request.data.decode('utf-8')
    soup = bs4(html, 'html.parser')
    arg_main = soup.find_all("div", {'class': 'inner-content clear-fix'})
    arg_second = bs4(str(arg_main), 'html.parser')
    arg_last = arg_second.find_all('div', {'class': 'inner-content__inner-right'})
    arg = bs4(str(arg_last), 'html.parser')
    arg_back = arg.find_all('div', {'class': 'tab-wrapper'})
    arg_second = bs4(str(arg_back), 'html.parser')
    arg = arg_second.find_all('a', href=True)
    for ar in arg:
        if ar.text:
            txt = ar.text
            txt = txt.strip()
            link = url + str(ar['href'])
            if txt != 'Իրադարձային' and txt != '' and txt != 'Սպորտ' and txt != 'Իրավունք':
                if txt != 'Քաղաքականություն' and txt != 'Մամուլի տեսություն' and txt != 'Ժամանց':
                    data_list.append([link, txt])
    return data_list

def __init__(self, distro=None, release=None, cachedir=None, secure=True):
    if cachedir:
        self.cachedir = cachedir
        os.makedirs(self.cachedir, exist_ok=True)
    self._parts = {'region': self.region, 'distro': distro, 'release': release}
    base = distro
    if not distro.startswith('http'):
        base = self._urls[distro]
    elif not distro.endswith('/'):
        base = base + '/'
    self.baseurl = base % self._parts
    self._pool = connection_from_url(self.baseurl)
    self._etag = {}
    self._content = {}
    release = self.get('Release')
    if secure:
        release_gpg = self.get('Release.gpg')
        gpg_verify(release, release_gpg)
    else:
        _log.warn('Skipping signature check of RELEASE')
    release = self.release = Release(release)
    self.codename = release['Codename']
    self.archs = set(release['Architectures'].split())
    self.components = set(release['Components'].split())
    info = proc_release(release)
    self._top = self.Manifest(self, info, '')

def get_con_pool(host,
                 key_file=None,
                 cert_file=None,
                 socket_timeout=15.0,
                 max_pool_size=3,
                 verify_https=True):
    """
    Return a ConnectionPool instance of given host
    :param socket_timeout: socket timeout for each connection in seconds
    """
    kwargs = {
        "timeout": socket_timeout,
        "maxsize": max_pool_size,
        "block": True,
    }
    if key_file is not None and cert_file is not None:
        kwargs["key_file"] = key_file
        kwargs["cert_file"] = cert_file
    if urlparse(host).scheme == "https":
        kwargs["ssl_version"] = ssl.PROTOCOL_TLSv1
        if verify_https:
            kwargs["cert_reqs"] = "CERT_REQUIRED"
            kwargs["ca_certs"] = getattr(settings, "RESTCLIENTS_CA_BUNDLE",
                                         "/etc/ssl/certs/ca-bundle.crt")
    return connection_from_url(host, **kwargs)

def __init__(self):
    super(MovieCrawlerUVK, self).__init__(cadena=u"UVK Multicines", tag="UVK")
    self.url = r"""http://www.uvkmulticines.com"""
    self.encoding = 'utf-8'
    # subtitle indicators
    # (also an indicator of the disorder on the UVK website)
    self.suffix_subtitles['doblada'] = [
        u'(doblada Estreno)',
        u'(Doblada - Estreno)',
        u'(Estreno Doblada)',
        u'(Doblada Estreno)',
        u'(HD Doblada)',
        u'(Digital doblada)',
        u'(Doblada)',
        u'( Doblada)',
    ]
    self.suffix_subtitles['subtitulada'] = [
        u'(subtitulada Estreno )',
        u'(Estreno Subtitulada)',
        u'(Subtitulada Estreno)',
        u'(Subtitulada - Estreno)',
        u'(HD Subtitulada)',
        u'(Digital subtitulada)',
        u'(subtitulada)',
        u'(Subtitulada)',
        u'( Subtitulada)',
    ]
    # resolution indicators
    self.suffix_resolutions['HD'] = [u'(HD Doblada)', u'(Digital subtitulada)',
                                     u'(Digital doblada)', u'HD']
    self.suffix_resolutions['3D'] = [u'3D', ]
    self.suffix_discard = ['(Estreno)', ]
    self.conn = urllib3.connection_from_url(self.url, timeout=self.timeout)

def __init__(self, username, password, inputfile, connection_pool_size=10):
    self.username = username
    self.password = password
    self.inputfile = inputfile
    self.sleep = 60
    self.pool = urllib3.connection_from_url('https://atlas.ripe.net',
                                            maxsize=connection_pool_size)
    self.headers = [('User-agent', 'Mozilla/5.0'),
                    ('Referer', 'https://atlas.ripe.net/atlas/udm.html'),
                    ('Host', 'atlas.ripe.net'),
                    ('Origin', 'https://atlas.ripe.net'),
                    ('X-Requested-With', 'XMLHttpRequest')]
    self.login()
    self.target_dict = measure_baseclass.load_input(self.inputfile)
    """
    self.target_list = []
    f = open(self.inputfile)
    for line in f:
        line = line.strip()
        chunks = line.split()
        target = chunks[0]
        probes = chunks[1:]
        #if target in self.target_list:
        #    sys.stderr.write('Already saw target %s\n' % target)
        #    continue
        self.target_list.append((target, probes))
    f.close()
    """

def __init__(self):
    super(MovieCrawlerCinerama, self).__init__(cadena=u"Cinerama", tag="CRAMA")
    self.url = r"http://www.cinerama.com.pe/"
    self.encoding = 'utf-8'
    self.conn = urllib3.connection_from_url(self.url, timeout=self.timeout)

def _getConnPool(self, layer, baseUrl):
    """Get a connection pool for the given layer id

    Each layer ID has its own connection pool
    (as each connection pool handles only one base URL)
    and non-existent pools are automatically created once requested

    NOTE: connection pools reuse open connections

    :param str layerId: pool id (mapped to layer id)
    :param str baseUrl: a URL used to initialize the connection pool
                        (basically just the domain name needs to be correct)
    """
    pool = self.connPools.get(layer.id, None)
    if pool:
        return pool
    else:
        # create pool
        # headers = {'User-Agent': "Mozilla/5.0 (compatible; MSIE 5.5; Linux)"}
        userAgent = self.modrana.configs.user_agent
        headers = {'User-Agent': userAgent}
        connection_timeout = constants.TILE_DOWNLOAD_TIMEOUT
        if layer.connection_timeout is not None:  # some value was set in the config
            if layer.connection_timeout < 0:  # -1 == no timeout
                connection_timeout = None  # None means no timeout for urllib3 connection pools
            else:
                connection_timeout = layer.connection_timeout
        if connection_timeout is None:
            self.log.debug("creating tile download pool for %s without a connection timeout", layer.id)
        else:
            self.log.debug("creating tile download pool for %s with connection timeout %s s",
                           layer.id, connection_timeout)
        newPool = urllib3.connection_from_url(url=baseUrl,
                                              headers=headers,
                                              maxsize=10,
                                              timeout=connection_timeout,
                                              block=False)
        self.connPools[layer.id] = newPool
        return newPool

def get_con_pool(host,
                 key_file=None,
                 cert_file=None,
                 ca_file=None,
                 socket_timeout=15.0,
                 max_pool_size=3,
                 verify_https=True):
    """
    Return a ConnectionPool instance of given host
    :param socket_timeout: socket timeout for each connection in seconds
    """
    kwargs = {
        "timeout": socket_timeout,
        "maxsize": max_pool_size,
        "block": True,
    }
    if key_file is not None and cert_file is not None:
        kwargs["key_file"] = key_file
        kwargs["cert_file"] = cert_file
    if urlparse(host).scheme == "https":
        kwargs["ssl_version"] = ssl.PROTOCOL_TLSv1
        if verify_https:
            kwargs["cert_reqs"] = "CERT_REQUIRED"
            kwargs["ca_certs"] = ca_file
    return connection_from_url(host, **kwargs)

def __init__(self, uri, analysis, schema, timeout, prefetch, http_op, num_pools):
    self.__uri = URI.URI(uri)
    if self.__uri.scheme != "http" and self.__uri.scheme != "https":
        raise Exception("Scheme '%s' is not currently supported" % uri.scheme)
    self.__host = str(self.__uri)
    self.__request = self.__uri.path
    self.__host = self.__host[:self.__host.find(self.__request)]
    self.__fields = dict(self.__uri.query)
    self.__analysisfunct = analysis
    self.__parser = etree.XMLParser(schema=schema)
    self.__timeout = timeout
    self.__prefetch = prefetch
    self.__http_op = http_op
    self.__httppool = urllib3.connection_from_url(self.__host, maxsize=num_pools)
    # AutoPageCount doesn't rely on earlier pages, so we can skip a thread pool
    if isinstance(self.__analysisfunct, AutoPageCount):
        self.__threadpool = None
    else:
        self.__threadpool = ThreadPool(1)
    self.__page = 0
    self.__data = []
    for slot in xrange(0, self.__prefetch):
        self.__dataptr = slot
        self.__data.append(None)
        self.__fillData()
    # Wait for the first page in case it was a bad URL
    if self.__getData(0).status != 200:
        raise Exception("HTTP error %d (%s)" % (self.__data[0].status, self.__data[0].reason))

def __init__(self, uri, poolmanager=None, username='******', password='******', **kwargs):
    """Constructor for the AICLib object.

    Arguments:
    uri -- the address of the nvp controller including scheme (required)

    Keyword arguments:
    poolmanager -- a pool manager provided by urllib3 (default None)
    username -- the username to log into the nvp controller
    password -- the password to log into the nvp controller
    """
    retries = kwargs.get("retries", 3)
    socket_options = \
        urllib3.connection.HTTPConnection.default_socket_options + \
        [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1), ]
    if poolmanager is None:
        self.conn = urllib3.connection_from_url(
            uri, retries=retries, socket_options=socket_options)
    else:
        self.conn = poolmanager.connection_from_url(
            uri, retries=retries, socket_options=socket_options)
    self.connection = Connection(connection=self.conn,
                                 username=username,
                                 password=password,
                                 **kwargs)

def getpicurl(picname):
    # input: The name of a file uploaded on the iGEM 2016 Wiki-Server
    #
    # IMPORTANT: The picture has to be uploaded before running the script!
    #
    # picname = input('please paste the name of an uploaded iGEM-wiki file:\n')

    # correct picname for changes the iGEM-Server needs
    picname = picname.replace(':', '-')

    # define the fixed url for the Wiki-Server
    url = 'http://2016.igem.org/File:Freiburg_%s' % picname
    # print('the url I looked for was:\n%s' % url)

    # get raw_html from url as specified here:
    # http://stackoverflow.com/questions/17257912/how-to-print-raw-html-string-using-urllib3
    http_pool = urllib3.connection_from_url(url)
    r = http_pool.urlopen('GET', url)
    raw_html = r.data.decode('utf-8')

    # initialise bs-object
    soup = BeautifulSoup(raw_html, 'html.parser')

    # find the href-link in an a-object in a div with id=file
    try:
        serverlink = 'http://2016.igem.org' + soup.find(id='file').find('a').get('href')
        # return the link
        return serverlink
    except:
        return None

def bench_urllib3_with_threads():
    begin = time.time()
    pool = urllib3.connection_from_url(urls[0], maxsize=4)
    urls_queue = Queue.Queue()
    for url in urls:
        urls_queue.put(url)

    def download():
        while True:
            try:
                url = urls_queue.get_nowait()
            except Queue.Empty:
                return
            pool.get_url(url)
            urls_queue.task_done()

    for i in range(4):
        threading.Thread(target=download).start()
    urls_queue.join()
    end = time.time()
    print "took %0.3f seconds" % (end - begin)

def __init__(self):
    super(MovieCrawlerCMP, self).__init__(cadena=u"Cinemark", tag="CMP")
    self.url = r"""http://www.cinemark-peru.com"""
    # subtitle indicators
    self.suffix_subtitles['doblada'] = [u'(DOB)', u'(DOB', ]
    self.suffix_subtitles['subtitluada'] = [u'(SUB)', u'(SUB', ]
    # resolution indicators
    self.suffix_resolutions['HD'] = [u'XD', u'XD 3D']
    self.suffix_resolutions['3D'] = [u'3D', ]
    # unique to Cinemark...
    self.prefix_resolutions['HD'] = [u'2D Digital', u'2D DIG']
    self.suffix_discard = ['2D', ]
    self.suffix_dbox = ['d-box', 'D-Box', 'D-box', 'D-BOX', 'DBOX', 'dbox', 'Dbox', ]
    # although the page claims 'utf-8', the server sends iso-8859-15
    # self.encoding = 'iso-8859-15'
    self.encoding = 'utf-8'
    # self.suffix_resolutions = self.replace_strings_for_regex_in_dict_with_list(self.suffix_resolutions)
    # self.prefix_resolutions = self.replace_strings_for_regex_in_dict_with_list(self.prefix_resolutions)
    # self.suffix_subtitles = self.replace_strings_for_regex_in_dict_with_list(self.suffix_subtitles)
    # self.prefix_subtitles = self.replace_strings_for_regex_in_dict_with_list(self.prefix_subtitles)
    self.conn = urllib3.connection_from_url(self.url, timeout=self.timeout)

def __init__(self):
    user_agent = '''Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'''
    self.headers = {'User_agent': user_agent}
    self.http_pool = urllib3.connection_from_url("http://top.baidu.com",
                                                 timeout=15,
                                                 maxsize=5,
                                                 headers=self.headers)

def post_message(message):
    try:
        url, run_id = get_config_val()
        pool = urllib3.connection_from_url(url)
        fields = {'run_id': run_id, 'data': message}
        pool.post_url('/process_manager/notify', fields)
    except MaxRetryError:
        print("Message delivery error: " + message)

def get_url_data(site_url, params=None):
    # http_pool = AppEngineManager()
    r = requests.get(site_url, params=params)
    print(r.url)
    http_pool = urllib3.connection_from_url(r.url)
    r = http_pool.urlopen('GET', r.url)
    soup = BeautifulSoup(r.data, 'lxml')
    return soup

def pool_get(url_list):
    assert url_list
    pool = urllib3.connection_from_url(url_list[0])
    for url in url_list:
        now = time.time()
        r = pool.get_url(url)
        elapsed = time.time() - now
        print("Got in %0.3fs: %s" % (elapsed, url))

def read_data_base(url):
    http = urllib3.connection_from_url(url)
    r = http.urlopen('GET', url)
    data = r.data.decode("utf-8-sig").encode("utf-8")
    return data

def reload_index(self, **kw):
    self.conn = urllib3.connection_from_url(self.root, **kw)
    c = self.conn._get_conn()
    c.request('OPTIONS', '*', headers={'Accept': 'application/json'})
    res = c.getresponse()
    body = res.read()
    self._index = json.loads(body)
    self.conn._put_conn(c)

def post_message(message):
    try:
        url, run_id = get_config_val()
        pool = urllib3.connection_from_url(url)
        fields = {'run_id': run_id, 'data': message}
        pool.post_url('/process_manager/notify', fields)
    except MaxRetryError:
        print("Message delivery error: " + message)