def parsePrice():
    # NOTE: 'yahoo url' is a placeholder for the Yahoo Finance quote page URL.
    r = requests.get('yahoo url')
    soup = bs4.BeautifulSoup(r.text, "xml")
    price = soup.find_all(
        'div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})[0].find('span').text
    return price
def get(num, overwrite):
    euler_url = 'http://projecteuler.net/'
    url = '{0}problem={1}'.format(euler_url, num)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    c = soup.find_all(id='content')[0]
    problem_text = []
    resource_file = None
    # Mark superscripts/subscripts with ^ and _ so they survive plain-text extraction.
    for x in c.find_all('sup'):
        x.string = '^' + ''.join(list(x.stripped_strings))
    for x in c.find_all('sub'):
        x.string = '_' + ''.join(list(x.stripped_strings))
    for x in c.find_all('div', class_='problem_content'):
        if x.find_all('a'):
            resource_file = '{0}{1}'.format(
                euler_url, x.find_all('a', href=True)[0]['href'])
        problem_text.append(' '.join(list(x.stripped_strings)))
    problem_text = '\n'.join(textwrap.wrap('\n'.join(problem_text)))
    problem_text = problem_text.replace(' ^', '^')
    problem_text = problem_text.replace('^ ', '^')
    problem_text = problem_text.replace(' _', '_')
    problem_text = problem_text.replace('_ ', '_')
    problem_name = (c.find_all('h2')[0].string
                    .lower()
                    .replace(' ', '_')
                    .replace('-', '_'))
    print(num)
    print(problem_name, end='\n\n')
    print(problem_text)
    if resource_file:
        resource_file_name = './data/{0:03d}_{1}.txt'.format(num, problem_name)
        make_file(resource_file_name,
                  requests.get(resource_file).content,
                  executable=False)
        resource = ("\nwith open('{0}', 'r') as f:\n"
                    "    DATA = f.readlines()\n\n".format(resource_file_name))
    else:
        resource = ''
    t = Template(TEMPLATE)
    s = t.safe_substitute(number=num, text=problem_text, resource=resource)
    file_name = './{0:03d}_{1}.py'.format(num, problem_name)
    if not os.path.isfile(file_name):
        make_file(file_name, s)
    else:
        if overwrite:
            make_file(file_name, s)
        else:
            print('\n{0} already exists'.format(file_name))
def get_container(product):
    global URL
    # Append each word of the product name to the search URL, percent-encoded spaces.
    tokens = nltk.word_tokenize(product)
    for token in tokens:
        URL += token + '%20'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, features="html.parser")
    print(soup.prettify())
    card_grid = soup.find("div", id="card_grid", recursive=True)
    return card_grid
def set_title_and_meta(self, htmltext):
    """Clean the HTML (comments, styles, scripts, forms, tables) before feature extraction."""
    soup = BeautifulSoup(htmltext.decode('utf-8'), 'html.parser')
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    [style.extract() for style in soup.findAll('style')]
    [script.extract() for script in soup.findAll('script')]
    [form.extract() for form in soup.findAll('form')]
    [table.extract() for table in soup.findAll('table')]
    self.set_title(soup)
    self.set_meta(soup)
    # Strip any remaining tags and keep only the body text.
    htmldoc = re.sub(
        '<[^>]*>', '',
        soup.body.prettify()).encode('utf-8') if soup.body else None
    self.set_permit_and_content(htmldoc)
def get_events():
    """Get the events from the webpage and format them for use in Google Calendar."""
    content = get_content()
    events = []
    for page in content:
        # Convert to strings so they can be joined into a date below.
        month = str(page["month"].month)
        year = str(page["month"].year)
        soup = BeautifulSoup(page["content"], "lxml")
        agenda_items = soup.find_all('ul', class_='agendaitems', attrs={})
        # The calendar page has two columns for events.
        for column in agenda_items:
            items = column.find_all('li')
            # Each event is a list item.
            for item in items:
                title = item.find("span", class_='kopje').get_text()
                url = CREA_MORE_INFO_ADDRESS.format(item.find_all("a", href=True)[0]['href'])
                description = item.find('span', class_='tekst').get_text()
                item_datetime = item.find('em', class_="datum").get_text().split('|')
                dates = ['-'.join([year, month, a[1]])
                         for a in [d.strip().split(' ') for d in item_datetime[:-1]]]
                # Gracefully handle events that occur on multiple days.
                for date in dates:
                    start_time = item_datetime[-1].strip() + ':00'
                    event_start = date + 'T' + start_time
                    event_end = (datetime.datetime.strptime(event_start, '%Y-%m-%dT%H:%M:%S')
                                 + datetime.timedelta(hours=2)).strftime('%Y-%m-%dT%H:%M:%S')
                    # Hashes are used as a unique identifier to prevent the same
                    # event from being added twice.
                    event_hash = hashlib.sha224(
                        str(title + url + date + start_time + description).encode('utf-8')).hexdigest()
                    event = {"summary": title,
                             "start": {"dateTime": event_start, "timeZone": "Europe/Amsterdam"},
                             "end": {"dateTime": event_end, "timeZone": "Europe/Amsterdam"},
                             "description": url + description,
                             "id": event_hash}
                    events.append(event)
    return events
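# Usage sketch (assumption, not part of the original): pushing the formatted
# events to Google Calendar. `service` is assumed to be an already-authorized
# googleapiclient resource (build('calendar', 'v3', ...)) and `calendar_id` is
# the target calendar; both names are hypothetical here.
def push_events(service, calendar_id):
    for event in get_events():
        # The precomputed sha224 hex id is valid for Calendar event ids
        # (base32hex characters, 5-1024 chars), so duplicates are rejected server-side.
        service.events().insert(calendarId=calendar_id, body=event).execute()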
def md5_decrypt(encrypt_string):
    headers_somd5 = {
        'x-requested-with': 'XMLHttpRequest',
        'Accept-Language': 'zh-cn',
        'Referer': 'http://www.somd5.com/',
        'Accept': '*/*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Host': 'www.somd5.com',
        'Proxy-Connection': 'Keep-Alive',
        'Pragma': 'no-cache'
    }
    url_somd5 = 'http://www.somd5.com/somd5-index-md5.html'
    data = 'isajax=QoG29V7X6mEGHt6Ep8pTI43&md5=' + encrypt_string
    cj = cookiejar.CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(cj))
    req1 = opener.open(url_somd5)
    resp1 = req1.read()
    # Keep the session cookie from the first request for the POST below.
    for a in cj:
        cookie_name = a.name
        cookie_value = a.value
    length = 35 + len(encrypt_string)
    data_encode = data.encode()
    req2 = request.Request(url=url_somd5, data=data_encode,
                           headers=headers_somd5, method='POST')
    req2.add_header('Content-Length', length)
    req2.add_header('Cookie', cookie_name + ':' + cookie_value)
    resp2 = request.urlopen(req2)
    resp2 = resp2.read(1000)
    try:
        resp2_decode = resp2.decode(encoding='gb18030')
    except UnicodeDecodeError:
        resp2_decode = resp2.decode(encoding='utf-8')
    bs = BeautifulSoup(resp2_decode, 'html.parser')
    print('Starting decryption...')
    for b in bs.find_all('h1'):
        text = b.get_text()
        print('Decrypted: {}'.format(text))
        sys.exit()
    print('Decryption failed...')
def main():
    global f_w
    t = datetime.datetime.now()
    t_format = t.strftime('%Y-%m-%d-%H-%M-%S')  # timestamp for the output file name
    f_name = 'cirt.net-passwords-' + t_format + '.txt'
    f_w = open(f_name, 'w+')
    req = request.urlopen(url)
    resp = req.read()
    resp_decode = resp.decode()
    bs = BeautifulSoup(resp_decode, 'html.parser')
    print('\n starting to process...\n')
    # Collect the vendor names from the index page; they are used below to fetch
    # each vendor's default-password page. The rows look like:
    # <tr><td><a href="?vendor=Huawei Technologies Co">Huawei Technologies Co</a></td>...</tr>
    for a in bs.find_all('tr'):
        for b in a.find_all('a'):
            cs_name = b.get_text()  # vendor name, e.g. "Huawei Technologies Co"
            changshang.append(cs_name)
    print(changshang)
    length_cs = len(changshang)
    print('cs len is :{}'.format(length_cs))
    # Fetch vendors in batches of up to 10 threads; the output is serialized
    # with a Lock in huoqu(), otherwise it gets scrambled.
    for i in range(0, length_cs, 10):
        k = min(10, length_cs - i)
        for j in range(k):
            j = i + j
            t = threading.Thread(target=huoqu, args=(changshang[j], ))
            threads.append(t)
        for m in range(len(threads)):
            threads[m].start()
        for n in range(len(threads)):
            threads[n].join()
        threads.clear()
        print('Batch done, sleeping for a moment...')
        sleep(6)
    f_w.close()
def meiju(k):
    global flag, count  # must be re-declared global here or they are treated as locals
    try:
        data = url + '/?author=' + str(k)
        req = request.urlopen(data, timeout=20)  # generous timeout for a flaky network
        resp = req.read(2000)
        data_decode = resp.decode(encoding='utf-8')
        data_soup = BeautifulSoup(data_decode, 'html.parser')
        for a in data_soup.find_all('title'):
            # FreeBuf renders the tag as "<title>\n sn0rt - FreeBuf.COM</title>",
            # so strip the surrounding whitespace first.
            b = a.get_text().strip()
            # Split on spaces; the username is the first piece. The exact separator
            # depends on the site config, but the username is usually followed by a space.
            b = b.split(' ')
            b = b[0].strip()  # strip again so no trailing space is left after the username
            print('User {0}: {1}'.format(k, b))
            f_w.write('User ' + str(k) + ': ' + b + '\n')  # k is an int; write() needs a string
    except Exception as e:
        excepts.append(k)
        length = len(excepts)
        if length > 3:
            # If several consecutive ids fail (differences of 1), assume we have
            # run past the last user id.
            excepts.sort()
            for i in range(1, length):
                if i + 1 < length:
                    if (excepts[i] - excepts[i - 1] == 1) and (excepts[i + 1] - excepts[i] == 1):
                        # With multiple threads the failing ids do not arrive in order,
                        # so any two ids differing by 1 count.
                        count.append(excepts[i - 1])
                        flag = False
            if len(count) > 3:
                count = list(set(count))
                print('count is : {}'.format(count))
        else:
            print('{0} error is : {1}'.format(k, e))
def huoqu(huoqu_url):
    # Fetch the default username/password page for one vendor.
    # The vendor name must be percent-encoded or the request fails.
    url_hq = cs_url + parse.quote(huoqu_url)
    print(url_hq)
    requ = request.urlopen(url_hq)
    response = requ.read()
    response_decode = response.decode()
    bs_huoqu = BeautifulSoup(response_decode, 'html.parser')
    print('\nVendor: {}'.format(huoqu_url))
    f_w.write('Vendor: {}'.format(huoqu_url) + '\n')
    # Lock so threads do not interleave their output. (Note: a Lock created inside
    # the function is per-call; a module-level lock would actually serialize the threads.)
    lock = threading.Lock()
    lock.acquire()
    for c in bs_huoqu.find_all('tr'):
        result = re.search(patt_href, c.get_text())
        if result is not None:
            pass
        else:
            # Header rows (starting with a digit) are printed as-is,
            # without the splitting done below.
            result_title = re.match(r'\d+', c.get_text())
            flag = False
            if result_title is None:
                flag = True
            else:
                print(c.get_text())
                f_w.write(c.get_text() + '\n')
            if flag:  # pretty-print as "username : password"
                d = c.get_text('\t: ')
                split_v = d.split(':')
                v1 = split_v[0]  # "User ID" or "PASSWORD"
                v1 = v1.strip('\t')
                length = len(v1)
                if length == 8:
                    print(c.get_text(' : '))
                    f_w.write(c.get_text(' : ') + '\n')
                else:
                    print(c.get_text('\t : '))
                    f_w.write(c.get_text('\t : ') + '\n')
    print('\n\n')
    f_w.write('\n')  # blank line between vendors
    lock.release()
import bs4 as bs
import urllib.request
import sqlite3

baglanti = sqlite3.connect("ornek.db")  # the connection and cursor are needed below
isaretci = baglanti.cursor()
#tablo = isaretci.execute('''CREATE TABLE haberler''')
kaynak = urllib.request.urlopen("http://www.milliyet.com").read()
sayfa = bs.BeautifulSoup(kaynak, 'lxml')
tablo = isaretci.execute('''CREATE TABLE linkler(id INTEGER PRIMARY KEY, link VARCHAR(255))''')
for nav in sayfa.findAll('a'):
    isaretci.execute('''INSERT INTO linkler(link) VALUES(?)''', (nav.get('href'),))
    print(nav.get('href'))
print(sayfa.title.string)
print(sayfa.findAll('p'))  # finds everything inside <p> tags
for paragraf in sayfa.findAll('p'):
    #isaretci.execute('''INSERT INTO haberler(haber) VALUES(?)''', (paragraf.string,))
    print(paragraf.string)
sonuc = isaretci.execute("SELECT * FROM linkler")
print(sonuc.fetchall())
baglanti.commit()
def createParser(self, rawHtml):
    self.parser = BeautifulSoup(rawHtml, 'html.parser')
def __init__(self, rawHtml):
    self.parser = BeautifulSoup(rawHtml, 'html.parser')
import bs4
import requests

res = requests.get('http://nostarch.com')
res.raise_for_status()
nostarchsoup = bs4.BeautifulSoup(res.text, 'html.parser')
type(nostarchsoup)
def get_page(url):
    return BeautifulSoup(GET(url), 'html.parser')
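# A minimal sketch (assumption, not from the original) of the GET helper that
# get_page() relies on: it simply returns the response body as text via requests.
import requests

def GET(url):
    resp = requests.get(url)
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return resp.text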
import requests, bs4, openpyxl

# Open the workbook and identify the sheet to dump data into.
wb = openpyxl.load_workbook('Book1.xlsx')
sheet = wb['Sheet1']

# Identify the website and obtain the HTML document for the web page.
url_str = 'website.com'
res = requests.get(url_str)
res.raise_for_status()  # checks for HTTP error codes when accessing the page
htmldoc = bs4.BeautifulSoup(res.text, 'html.parser')

# Identify selectors and put the matches into the list elements_list
# ('.' indicates an HTML class selector).
elements_list = htmldoc.select('.question-summary narrow')

# For each instance in elements_list, add its text and link to the Excel sheet.
i = 2
for instance in elements_list:
    sheet.cell(row=i, column=1).value = instance.get_text()
    sheet.cell(row=i, column=2).value = instance.get('href')
    i += 1

# Save the workbook and close.
wb.save('Book1.xlsx')
wb.close()
class Connection(object):

    def __init__(self, username=None, password=None, verify=False):
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.session = requests.Session()
        self.verify = verify
        if username and password:
            self.base_uri = '://www.nsof.class.noaa.gov/saa/products/'
            self.authenticate = Auth(username, password)
            self.get('welcome')
            self.translator = Translator()
            self.authenticate.do(self)
            self.request = Request(self)
            self.subscribe = Subscribe(self)
        else:
            self.base_uri = '://www.nsof.class.noaa.gov'

    def next_up_datetime(self):
        end = datetime.utcnow()
        self.get('')
        middle = self.last_response_soup.select('#middle p')
        if len(middle) > 0:
            text = middle[1].text
            regex = re.compile(", (.*), from (.*) UTC .* through (.*) UTC")
            params = list(regex.findall(text)[0])
            pattern = '%m/%d/%y %H%M'
            begin = datetime.strptime('%s %s' % tuple(params[0:2]), pattern)
            end = datetime.strptime('%s %s' % (params[0], params[2]), pattern)
            if begin >= end:
                end += timedelta(days=1)
        from pytz import utc
        return end.replace(tzinfo=utc)

    @property
    def cookies(self):
        self._cookies = requests.utils.cookiejar_from_dict(
            requests.utils.dict_from_cookiejar(self.session.cookies))
        return self._cookies

    @property
    def last_response(self):
        return self._last_response

    @last_response.setter
    def last_response(self, response):
        packed = self.pack(response).select('h1')
        if (response.status_code != requests.codes.ok
                or (packed and 'An Error Occurred' in packed[0].text)):
            raise Exception('Connection error (%i).' % response.status_code)
        self._last_response = response

    @property
    def last_response_soup(self):
        return self.pack(self.last_response)

    def get(self, url, proto='http'):
        """
        Load an url using the GET method.

        Keyword arguments:
        url -- the Universal Resource Location
        proto -- the protocol (default 'http')
        """
        self.last_response = self.session.get(proto + self.base_uri + url,
                                              headers=self.headers,
                                              cookies=self.cookies,
                                              allow_redirects=True,
                                              verify=self.verify)
        return self.last_response_soup

    def pack(self, response, async_=False):
        # "async" became a reserved word in Python 3.7, so the flag is renamed async_.
        soup = BeautifulSoup(response.text, 'html.parser')
        if async_:
            response.close()
        return soup
import requests
from bs4 import BeautifulSoup

vnexpress = requests.get("https://vnexpress.net/")
print(vnexpress)

# Save the page to a local file.
file_name = "vnexpress.html"
file_html = open(file_name, "wb")
file_html.write(vnexpress.content)
file_html.close()

# Read the saved file back and decode it.
open_file = open(file_name, "rb")
decoded_content = open_file.read().decode('utf-8')
open_file.close()

trangweb_vnexpress = BeautifulSoup(decoded_content, "html.parser")
print(trangweb_vnexpress.find("div", attrs={"class": "scroll-pane"}))
def table(self):
    page = requests.get('http://www.star.nesdis.noaa.gov/smcd/spb/fwu/'
                        'homepage/GOES_Imager_Vis_OpCal.php')
    pq = BeautifulSoup(page.text, 'html.parser')
    # The calibration data is the third table on the page; return its rows.
    return (pq.select("table")[2]).select("tr")
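# Usage sketch (assumption, not from the original): flatten the <tr> rows that
# table() returns into lists of cell text. The `cal` argument stands in for
# whatever object exposes the table() method above.
def calibration_rows(cal):
    rows = cal.table()
    # Skip the header row; keep the stripped text of each <td> cell.
    return [[td.get_text(strip=True) for td in row.find_all('td')] for row in rows[1:]]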
import requests
from bs4 import BeautifulSoup
import json

fungame = []
url = "https://www.miniclip.com/games/en/"
page = requests.get(url).content
soup = BeautifulSoup(page, 'html.parser')
games = soup.find_all('article', class_='slick-slide')
for game in games:
    if 'Play_code' in game['class']:
        continue
    # Grab the game's thumbnail; store the image URL, since Tag objects
    # are not JSON serializable.
    picture = game.find('img', itemprop="image")
    mygame = {'picture': picture.get('src') if picture else None}
    fungame.append(mygame)
#print(games[0].prettify())
with open('data.json', 'w') as outfile:
    json.dump(fungame, outfile)
import requests
from bs4 import BeautifulSoup


def response_text():
    url = 'https://www.gismeteo.ru/weather-yaroslavl-4313/month/'
    return requests.get(url, headers={
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/84.0.4147.89 Safari/537.36'
    }).text


def get_bs4_elements_tag(search_location, name_tag, class_tag):
    return search_location.findAll(name_tag, class_=class_tag)


soup = BeautifulSoup(response_text(), 'lxml')
div_tooltip_cell = get_bs4_elements_tag(soup, 'div', 'tooltip cell')
div_tooltip_cell_hover = get_bs4_elements_tag(soup, 'div', 'tooltip cell _hover')


def get_bs4_element_tag(search_location, name_tag, class_tag):
    return search_location.find(name_tag, class_=class_tag)


def parse_date(bs4_element_tag):
    result = get_bs4_element_tag(bs4_element_tag, 'div', 'date')
    # Cells at a month boundary carry an extra wrapper around the date label.
    if len(result.contents) > 1:
        return result.contents[0].contents[0].string
    else:
        return result.contents[0].string
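# Small usage sketch (assumption, not part of the original): print the date
# label of every calendar cell collected above.
if __name__ == '__main__':
    for cell in div_tooltip_cell + div_tooltip_cell_hover:
        print(parse_date(cell))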
# basic exception handling
try:
    a = int(input("enter a number"))
    b = int(input("enter another number"))
    ans = a / b
    print(ans)
except ZeroDivisionError as ex:
    print("you cannot divide a number by 0")
except KeyboardInterrupt as key:
    print("\nprogram exited by user")

#bs4
from bs4 import BeautifulSoup
import requests

try:
    page = requests.get("https://twitter.com/adhikaritaiwan")
    soup = BeautifulSoup(page.content, 'html.parser')
    all_item = soup.find_all(class_="content")
    for content in all_item:
        tweet_text = content.find(class_="js-tweet-text-contains")
        print(tweet_text)
except Exception as ex:
    print("Couldn't connect to twitter")