from urllib.request import urlopen as open
from urllib.error import URLError

def acessarSite(url):
    # First confirm there is an internet connection at all.
    try:
        open('https://www.google.com.br')
    except URLError:
        print('No internet connection.')
        exit()
    # Then check whether the requested site itself is reachable.
    try:
        open(url)
    except ValueError:
        print('The site is invalid.')
    except URLError:
        print('The site is unreachable or does not exist.')
    else:
        print('The site is accessible.')
def _request(self, url: str, method: str = 'GET', payload: bytes = None,
             content_type: MimeType = None, accept: Sequence[MimeType] = None) -> Response:
    """General purpose request."""
    request = Request(url=url,
                      sni_hostname=self._sni_hostname,
                      client_cert_path=self._client_cert_path,
                      client_key_path=self._client_key_path,
                      ca_cert_path=self._ca_cert_path,
                      data=payload,
                      method=method)
    if payload is not None and content_type:
        request.add_header('Content-Type', str(content_type))
    if accept:
        request.add_header('Accept', ', '.join(str(mime_type) for mime_type in accept))
    if self.username and self.password:
        request.add_header(
            'Authorization',
            'Basic ' + base64.b64encode(
                (self.username + ':' + self.password).encode('utf-8')).decode('ascii'))
    request.add_header('User-Agent', self.user_agent)
    try:
        with request.open() as response:
            return Response(response)
    except urllib.error.HTTPError as error:
        raise NetworkError(error.reason, error.code)
    except urllib.error.URLError as error:
        raise NetworkError(error.reason)
    except Exception as error:
        raise NetworkError(str(error))
def shares(year=None):
    url = backtest(year)
    cookie = http.cookiejar.CookieJar()
    request = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    request.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
    ]
    # These are the stock screening parameters.
    # They are left blank so that every available stock is returned.
    data = {'pl_min': '', 'pl_max': '', 'pvp_min': '', 'pvp_max': '',
            'psr_min': '', 'psr_max': '', 'divy_min': '', 'divy_max': '',
            'pativos_min': '', 'pativos_max': '', 'pcapgiro_min': '', 'pcapgiro_max': '',
            'pebit_min': '', 'pebit_max': '', 'fgrah_min': '', 'fgrah_max': '',
            'firma_ebit_min': '', 'firma_ebit_max': '', 'margemebit_min': '', 'margemebit_max': '',
            'margemliq_min': '', 'margemliq_max': '', 'liqcorr_min': '', 'liqcorr_max': '',
            'roic_min': '', 'roic_max': '', 'roe_min': '', 'roe_max': '',
            'liq_min': '', 'liq_max': '', 'patrim_min': '', 'patrim_max': '',
            'divbruta_min': '', 'divbruta_max': '', 'tx_cresc_rec_min': '', 'tx_cresc_rec_max': '',
            'setor': '', 'negociada': 'ON', 'ordem': '1', 'x': '28', 'y': '16'}
    with request.open(url, urllib.parse.urlencode(data).encode('UTF-8')) as link:
        content = link.read().decode('ISO-8859-1')
    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    content = re.findall(pattern, content)[0]
    page = fragment_fromstring(content)
    result = pandas.DataFrame(dataframe_opts(year))
    for rows in page.xpath('tbody')[0].findall("tr"):
        new_row = pandas.DataFrame(
            index=[rows.getchildren()[0][0].getchildren()[0].text],
            data=dataframe_data(rows, year))
        result = result.append(new_row)
    return result[result['Cotação'] > 0]
def e621_search(md5, username='******'):
    request = urllib.request.build_opener()
    request.addheaders = [('User-agent', 'Yiffdex/0.2a used by ' + username)]
    try:
        f = request.open("https://e621.net/post/show.json?md5=" + md5)
        if f is not None:
            data = json.loads(f.read())
            info = {}
            info['tags'] = data['tags'].split(" ")
            info['author'] = ';'.join(data['artist']) if 'artist' in data else ''
            info['src'] = data['source'] if 'source' in data and data['source'] is not None else ''
            return info
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
    except json.JSONDecodeError:
        pass
    return None
def find_link_content(link):
    page = 1
    while True:
        new_link = "http://quotes.toscrape.com" + link + "page/"
        # print(new_link)
        new_link = new_link + str(page)
        print(new_link)
        sub_bs = open(new_link)
        sub_bs = BeautifulSoup(sub_bs, 'html.parser')
        quotes = sub_bs.select('div.row div.col-md-8 span.text')
        # Stop once a page returns no data.
        if len(quotes) == 0:
            break
        # Quotes
        quotes = [quote.text.strip('“”') for quote in quotes]
        # Authors
        authors = sub_bs.select('small.author')
        authors = [author.text for author in authors]
        # Tags
        tags_list = sub_bs.select('meta.keywords')
        tags_list = [tags.get('content') for tags in tags_list]
        # print(authors)
        # print(quotes)
        # print(tags_list)
        record_list = []
        for i in range(len(quotes)):
            tags = tags_list[i]
            tags = tags.replace(',', ',')
            print(tags)
            record = [quotes[i], authors[i], tags]
            record_list.append(record)
        insert_into_mysql(record_list)
        page += 1
def goto1():
    url = "http://ip-api.com/json/"
    response = open(url + ip)
    data = response.read()
    values = json.loads(data)
    status = values['status']
    success = "success"
    lat = str(values['lat'])
    lon = str(values['lon'])
    a = lat + ","
    b = lon + "/"
    c = b + "data=!3m1!1e3?hl=en"
    location = a + c
    maps = "https://www.google.com/maps/search/"
    webbrowser.open(maps + location)
    print(" IP: " + values['query'])
    print(" Status: " + values['status'])
    print(" city: " + values['city'])
    print(" ISP: " + values['isp'])
    print(" latitude: " + lat)
    print(" longitude: " + lon)
    print(" country: " + values['country'])
    print(" region: " + values['regionName'])
    print(" city: " + values['city'])
    print(" zip: " + values['zip'])
    print(" AS: " + values['as'])
    if status == success:
        speak("successfully located")
    else:
        speak("cannot find the location, sir")
    goto2()
from urllib.request import urlopen as open
from bs4 import BeautifulSoup

def find_top_ten(url):
    response = open(url)
    bs = BeautifulSoup(response, 'html.parser')
    tags = bs.select('span.tag-item a')
    top_ten_href = [tag.get('href') for tag in tags]
    top_ten_tag = [tag.text for tag in tags]
    # print(top_ten_href)
    # print(top_ten_tag)
    return top_ten_href
from urllib.request import urlopen as open

def test_scrape():
    url = 'http://python.org'
    html = open(url).read()
    html_decoded = html.decode("utf-8")
    title_index = html_decoded.find("<title>")
    start_index = title_index + len("<title>")
    end_index = html_decoded.find("</title>")
    title = html_decoded[start_index:end_index]
    return title
from urllib.request import urlopen as open

def nameandcolor_scrape():
    url = "http://olympus.realpython.org/profiles/dionysus"
    html = open(url).read().decode("utf-8")
    patterns = ["Name:", "Favorite Color: "]
    for string in patterns:
        start = html.find(string) + len(string)
        tag = start + html[start:].find("<")
        texts = html[start:tag]
        print(texts)
import re
from urllib.request import urlopen as open

def scrape_work():
    url = 'http://python.org'
    html = open(url).read()
    html_decoded = html.decode("utf-8")
    pattern = "<title.*?>.*?</title.*?>"
    match_results = re.search(pattern, html_decoded, re.IGNORECASE)
    title = match_results.group()
    title = re.sub("<.*?>", "", title)  # Remove HTML tags from the title
    return title
def logout(self):
    request = urllib.request.build_opener()
    request.addheaders = [('User-agent', 'Yiffdex/0.2a')]
    try:
        f = request.open(self.api_url + 'api_logout.php?sid=' + self.sid)
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
def login(self, username='******', password=''):
    request = urllib.request.build_opener()
    request.addheaders = [('User-agent', 'Yiffdex/0.2a'),
                          ('Content-type', 'multipart/form-data')]
    # Log in and get a session id (sid)
    try:
        params = urllib.parse.urlencode({
            'username': username,
            'password': password
        }).encode('ascii')
        f = request.open(self.api_url + 'api_login.php', params)
        if f is not None:
            data = json.loads(f.read())
            self.sid = data['sid'] if 'sid' in data else ''
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
    except json.JSONDecodeError:
        pass
    if self.sid == '':
        print('Unable to login on inkbunny!')
        return False
    # Modify rating options
    try:
        params = urllib.parse.urlencode({
            'sid': self.sid,
            'tag[2]': 'yes',
            'tag[3]': 'yes',
            'tag[4]': 'yes',
            'tag[5]': 'yes'
        }).encode('ascii')
        f = request.open(self.api_url + 'api_userrating.php', params)
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
    return True
def get_files_links(self, urls):
    files = {}
    config = Config.get_config()
    debug = config.debug
    for url in urls:
        request = self.request.get_opener()
        with request.open(url) as f:
            data = f.read().decode("utf-8")
        files[url] = re.findall(r'href=[\'"]?([^\'" >]+)', data, re.UNICODE | re.MULTILINE)
        files[url] = files[url][1:]
    if debug:
        formatter.debug_message("Found files: {0} for urls {1}".format(files, urls))
    return files
def __init__(self, request):
    """Set up a response object."""
    self.error = False
    try:
        response = request.open()
        self.data = response.read()
        self.status_code = response.getcode()
        self.mimetype = response.info().get_content_type()
    except urllib.error.HTTPError as error:
        self.status_code = error.getcode()
        self.mimetype = error.info().get_content_type()
        self.data = error.read()
        self.error = True
def __request(self, url):
    try:
        request = urllib.request.build_opener()
        # Headers for an OpenerDirector go in its addheaders attribute.
        request.addheaders = [
            ('User-agent', 'matheushssrobot/site:matheushsoaress.wordpress.com')
        ]
        return request.open(url)
    except urllib.error.ContentTooShortError as e:
        print("A urllib error occurred: " + repr(e))
    except urllib.error.HTTPError as e:
        print("A urllib error occurred: " + repr(e))
    except urllib.error.URLError as e:
        print("A urllib error occurred: " + repr(e))
    except ValueError as e:
        print(repr(e))
    except Exception as e:
        print(repr(e))
        print("An unknown error occurred in the Robot.__request method")
from urllib.request import urlopen as open
from bs4 import BeautifulSoup as bs

soup = bs(open('https://www.naver.com/'), 'html.parser')

def find(Obj1, Obj2):
    return soup.find(Obj1, Obj2)

find("div", {"class": "nanan"})
"https://www.deltakits.com/shop/windshield-repair-products/", "https://www.deltakits.com/shop/windshield-repair-products/page/2/", "https://www.deltakits.com/shop/windshield-repair-products/page/3/", "https://www.deltakits.com/shop/windshield-repair-products/page/4/", "https://www.deltakits.com/shop/windshield-repair-products/page/5/", "https://www.deltakits.com/shop/windshield-repair-products/page/6/", "https://www.deltakits.com/shop/windshield-repair-products/page/7/", "https://www.deltakits.com/shop/windshield-repair-products/page/8/", "https://www.deltakits.com/shop/windshield-repair-products/page/9/", "https://www.deltakits.com/shop/windshield-repair-products/page/10/", "https://www.deltakits.com/shop/windshield-repair-products/page/11/", "https://www.deltakits.com/shop/windshield-repair-products/page/15/" ] for url in urls: download = open(url) page_html = download.read() download.close() page_soup = soup(page_html, "html.parser") containers = page_soup.findAll("", {"class": "product-inner"}) for container in containers: kit_model = container.findAll("div", {"class": "cat-list"}) kit = kit_model[0].text.strip() kit_price = container.findAll("span", {"class": "woocommerce-Price-amount"}) price = kit_price[0].text.strip() kit_rating = container.findAll("div", {"class": "star-rating"}) rating = kit_rating[0].text.strip()
req = request.Request('http://javaweb.io/login')
requestBody = parse.urlencode({
    'name': 'root',
    'pass': '******',
    'verifyCode': verifyCode,
})

# Perform the login through this cookie-aware opener
with opener.open(req, data=bytes(requestBody, 'utf_8')) as response:
    # Login OK; the cookie now carries the login credential, so we can visit the home page
    with opener.open('http://javaweb.io') as response:
        print(response.read().decode())

----------------------------
urllib-HTTPResponse
----------------------------
* The HTTP response object, i.e. the object returned by request.open()
* Instance attributes
    status
        * HTTP status code
    reason
        * Reason phrase / description
* Methods
    bytes read()
        * Read the response body
    bytes readline()
        * Read one line of data
    int getcode()
        * Return the HTTP status code
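A minimal sketch of these members, using the plain urlopen() from the standard library rather than the cookie-aware opener above (the target URL is reused from the snippet and stands in for any reachable page):

from urllib.request import urlopen

# Illustrates the HTTPResponse attributes and methods listed above.
with urlopen('http://javaweb.io') as response:
    print(response.status)     # HTTP status code, e.g. 200
    print(response.reason)     # reason phrase, e.g. 'OK'
    print(response.getcode())  # same status code via the method API
    body = response.read()     # whole response body, as bytes
    print(body[:80])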
# Using internal/external libraries via import
"""
import urllib.request
import bs4

url = "https://www.naver.com/"
html = urllib.request.urlopen(url)
bs4.BeautifulSoup(html)
bsObj = bs4.BeautifulSoup(html)
"""

# Shortening the code
from urllib.request import urlopen as open
from bs4 import BeautifulSoup as bs

url = "https://www.naver.com/"
bsObj = bs(open(url))

# Using a function
def find(para, para2):
    return bsObj.find(para, para2)

'''
# 1. Crawl the whole page
# print(html.read())  read everything
# print(bsObj)        read everything with bs4

# 2. Crawl part of the page (the top menu area)
# top_Obj = bsObj.find("div", {"class": "service_area"})
# print(top_Obj)
from urllib.request import urlopen as open
from bs4 import BeautifulSoup as BS

def beautifulcrape():
    url = open("http://python.org").read().decode("utf-8")
    bowl = BS(url, "html.parser")
    return bowl
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from urllib.request import urlopen as open
import re

url = "http://quotes.toscrape.com/"
response = open(url)
html = response.read()
html = html.decode("UTF-8")

# Get the 10 quotes
result = re.findall('<span class="text" itemprop="text">(.*)</span>', html)
print(result)
print(len(result))

true_quotes = []
for single_result in result:
    # strip scans from both ends, removing every character that is in the given set
    new_result = single_result.strip("“”.")
    true_quotes.append(new_result)
    # print(new_result)

# Get the authors of the 10 quotes
true_authors = []
authors = re.findall('<small class="author" itemprop="author">(.*)</small>', html)
print(len(authors))
for author in authors:
    true_authors.append(author)
    dest='n',
    default=2000,
    type=int,
    help='Number of repositories per request')
args = parser.parse_args()

request = urllib.request.URLopener()
request.addheader(
    'Authorization',
    args.auth_header,
)

last = ''
repositories = []
while True:
    params = urllib.parse.urlencode({'n': args.n, 'last': last})
    response = request.open(args.catalog_url + '?' + params).read()
    response_dict = json.loads(str(response, 'utf-8'))
    if len(response_dict['repositories']):
        last = response_dict['repositories'][-1]
        repositories += response_dict['repositories']
        print('Last repo is: ' + last)
    else:
        break
print(repositories)
import mechanicalsoup as MS

def mechanicalscrape():
    # Browser.get() expects a URL, not already-downloaded HTML.
    url = "http://python.org"
    bowl = MS.Browser()
    spoon = bowl.get(url)
    return spoon
from urllib.request import urlopen as open

# Customizable tags to search for
url = input("What website would you like to scrape from? ")
decode = input("What decoding method would you like to use? ")
start = input("What starting position would you like to search for? ")
end = input("What ending position would you like to search for? ")

# Opening the website
html = open(url)
readfile = html.read().decode(decode)

# Gathering the indexes
start_index = readfile.find(start)
end_index = readfile.find(end)
startpos = start_index + len(start)
endpos = end_index + len(end)

# Extracting the information
isScraping = readfile[startpos:endpos]
print("Here is the information you requested!")
print("--------------------------------------")
print(isScraping)
from urllib.request import urlopen as open
from tkinter import Tk
import win32com.client as win32

u = open("http://n.sinaimg.cn/news/1_img/dfic/34fa2aa3/62/w1024h638/20190510/4350-hwsffzc2527481.jpg")
x1 = win32.gencache.EnsureDispatch('Excel.Application')
ss = x1.Workbooks.Add()
sh = ss.ActiveSheet
x1.Visible = True
sh.Cells(5, 10).Value = u
import re
import urllib.request as l

city = input("Enter city name: ")
url = "http://www.weather-forecast.com/locations/" + city + "/forecasts/latest"
# urllib.request has no open(); the function is urlopen().
data = l.urlopen(url).read()
data1 = data.decode("utf-8")
def search(self, md5):
    request = urllib.request.build_opener()
    request.addheaders = [('User-agent', 'Yiffdex/0.2a'),
                          ('Content-type', 'multipart/form-data')]
    data = None
    # Search for the file
    try:
        params = urllib.parse.urlencode({
            'sid': self.sid,
            'text': md5,
            'md5': 'yes',
            'submission_ids_only': 'yes',
            'keywords': 'no'
        }).encode('ascii')
        f = request.open(self.api_url + 'api_search.php', params)
        if f is not None:
            data = json.loads(f.read())
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
    except json.JSONDecodeError:
        pass
    # Not found (data stays None if the request itself failed)
    if data is None or int(data['results_count_all']) == 0:
        return None
    # Get submission information
    try:
        params = urllib.parse.urlencode({
            'sid': self.sid,
            'submission_ids': data['submissions'][0]['submission_id']
        }).encode('ascii')
        f = request.open(self.api_url + 'api_submissions.php', params)
        if f is not None:
            data = json.loads(f.read())
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
    except json.JSONDecodeError:
        pass
    # Not found (a guard that should not be reachable here)
    if data['results_count'] == 0:
        return None
    info = {}
    info['tags'] = [k['keyword_name'] for k in data['submissions'][0]['keywords']]
    info['author'] = data['submissions'][0]['username']
    info['src'] = 'https://inkbunny.net/s/' + data['submissions'][0]['submission_id']
    return info
from urllib.request import urlopen as open
import json

with open('https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_5m.json') as response:
    states = json.load(response)

import pandas as pd

df = pd.read_csv("covid19_vaccinations_in_the_united_states.csv", dtype={"State": str}, skiprows=3)
df2 = pd.read_csv("united_states_covid19_cases_and_deaths_by_state.csv", skiprows=3)
df.rename(columns={"State/Territory/Federal Entity": "State"}, inplace=True)
df2.rename(columns={"State/Territory/Federal Entity": "State"}, inplace=True)
df.at[42, "State"] = "New York"
df.drop(df.index[5], inplace=True)
df.drop(df.index[9], inplace=True)
df.drop(df.index[17], inplace=True)
df.drop(df.index[25], inplace=True)
df.drop(df.index[52], inplace=True)
df2.loc[38, "Total Cases"] += df2.loc[39, "Total Cases"]
df2.drop(df2.index[39], inplace=True)
df2.drop(df2.index[44], inplace=True)
df2.drop(df2.index[58], inplace=True)
df["Total Cases"] = [total_case for total_case in df2["Total Cases"]]
df["# Vaccinated/Total Cases"] = [vaccinated / total_case
                                  for total_case, vaccinated
                                  in zip(df["Total Cases"], df["Total Administered"])]

import plotly.express as px

fig = px.choropleth(df, geojson=states, locations="State",
                    color="# Vaccinated/Total Cases",
                    featureidkey="properties.NAME",
                    color_continuous_scale="Bugn",
                    range_color=(0, 1),
                    scope="usa",
                    labels={"Total Administered": "Vaccines Administered"},