def scrap(self):
    url = 'https://finance.naver.com/item/sise_day.nhn?code={item}'.format(item=self.item)
    # class = tah p11
    soup = BeautifulSoup(uo(url), "html.parser")
    for i in soup.find_all(name="span", attrs={"class": "tah"}):
        print(str(i.text))
def get_movie_list(substr):
    # fetch the first page of results for the substring to learn the page count
    url = ('https://jsonmock.hackerrank.com/api/movies/search/?Title='
           + str(substr) + '&page=1')
    uClient = uo(url)
    json_movie = uClient.read()
    uClient.close()

    # to work with json
    parsed_json = json.loads(json_movie)
    tot_page = parsed_json["total_pages"]

    # collect every title across all result pages (pages are 1-indexed)
    title = []
    for pageNo in range(1, tot_page + 1):
        mod_url = ('https://jsonmock.hackerrank.com/api/movies/search/?Title='
                   + str(substr) + '&page=' + str(pageNo))
        uClient = uo(mod_url)
        json_searchMovie = uClient.read()
        uClient.close()
        json_movie = json.loads(json_searchMovie)
        for movie in json_movie['data']:
            title.append(movie['Title'])
    return sorted(title)
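# A minimal usage sketch for get_movie_list, assuming the json module and the
# urlopen-as-uo import used above; the substring 'maze' is illustrative only.
if __name__ == '__main__':
    for movie_title in get_movie_list('maze'):
        print(movie_title)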
def sc1(self):
    # r is urllib.request.Request, uo is urlopen, sp is BeautifulSoup
    pg = r(
        homePage.url2,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img alt="Times of India" class="img-fluid" src="https://static.mediawire.in/brands/profilepic/1117/TOI%20Logo%20in%20Red%20Bakcground.jpg">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgs1 = soup.find('div', {'class': 'listing4 clearfix'}).find('ul').findAll('li')
        for i in imgs1:
            heads.append(i.find('span').find('a').text)
            links.append(i.find('span').find('a').get('href'))
            imgs.append(i.find('a').find('img').get('data-src'))
            logos.append(logourl)
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        # page layout changed or an element was missing: return no stories
        return []
def sp(self):
    # NOTE: the method name sp collides with the module-level BeautifulSoup
    # alias sp; inside the body, sp still resolves to the global alias.
    # pg = r(url1, {'User-Agent': 'Magic Browser'})
    pg = r(
        sportPage.url1,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgsl = soup.find('div', {'class': 'nation'}).findAll('div', {'class': 'snaps'})
        headsl = soup.find('div', {'class': 'nation'}).findAll('h2', {'class': 'title'})
        for i in imgsl:
            links.append(i.find('a').get('href'))
            logos.append(logourl)
            imgs.append(i.find('img').get('data-lazy-src'))
        for i in headsl:
            heads.append(i.find('a').text)
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        return []
def tc2(self):
    pg = r(
        techPage.url3,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://cdn.gadgets360.com/gadgets360_logo.png" alt="Technology News" title="NDTV Gadgets 360">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgsl = soup.find('div', {'class': 'story_list row margin_b30'}).findAll('div', {'class': 'thumb'})
        for i in imgsl:
            # lazy-loaded thumbnails carry the real image in data-original
            if i.find('img').get('src') == "https://gadgets.ndtv.com/static/icons/img_120n.png":
                imgs.append(i.find('img').get('data-original'))
            else:
                imgs.append(i.find('img').get('src'))
            links.append(i.find('a').get('href'))
            logos.append(logourl)
            heads.append(i.find('img').get('alt'))
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        return []
def tc(self):
    pg = r(
        techPage.url1,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgsl = soup.find('ul', {'class': 'article-list'}).findAll('li')
        for i in imgsl:
            imgs.append(i.find('img').get('src'))
            links.append(i.find('a').get('href'))
            logos.append(logourl)
            heads.append(i.find('img').get('alt'))
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        return []
def ec(self):
    # pg = r(url1, {'User-Agent': 'Magic Browser'})
    pg = r(
        ecoPage.url1,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img class="img-fluid" src="https://www.financialexpress.com/wp-content/themes/vip/financialexpress/assets/images/fe-logo-with-read-to-lead.svg" alt="Financial Express">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgl1 = soup.find('div', {'class': 'leftcol'}).findAll('figure')
        titles1 = soup.find('div', {'class': 'leftcol'}).findAll('h2')
        titles2 = soup.find('div', {'class': 'leftcol'}).findAll('h3')
        for i in imgl1:
            imgs.append(i.find('img').get('data-src'))
            links.append(i.find('a').get('href'))
            logos.append(logourl)
        # headline order (h2 items, then h3 items) is assumed to match the figure order
        for i in titles1:
            heads.append(i.find('a').text)
        for i in titles2:
            heads.append(i.find('a').text)
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        return []
import urllib.error

def fetch_url(url):
    try:
        web_client = uo(url)
        if web_client.getcode() == 504:
            raise ConnectionError('gateway timeout')
        web_page = web_client.read()
    except urllib.error.HTTPError:
        raise ConnectionError('http error occurred')
    except urllib.error.URLError:
        raise ConnectionError('url error occurred')
    else:
        web_client.close()
        return web_page
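# A short usage sketch for fetch_url, assuming the urlopen-as-uo import;
# the URL below is illustrative only, not from the original code.
try:
    page = fetch_url('http://example.com')
except ConnectionError as err:
    print('fetch failed:', err)
else:
    print(len(page), 'bytes fetched')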
def scraping():
    global TREND_DATA
    uClient = uo(main_url)
    forum_page = uClient.read()
    uClient.close()
    pageSoup = soup(forum_page, "html.parser")
    trendListContainer = pageSoup.findAll("div", {"class": "trend-card"})
    for i in trendListContainer:
        trendTime = i.find("h5").text
        trendList = i.findAll("li")
        for j in trendList:
            TREND_DATA.append(j.text)
    print("=========MOMENT TRENDING SCRAPE==========")
    FindMostTrending()
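# FindMostTrending is called above but not defined in this snippet. A plausible
# minimal sketch, assuming it only ranks the strings gathered in TREND_DATA by
# frequency; this is a hypothetical stand-in, not the original implementation.
from collections import Counter

def FindMostTrending(top_n=5):
    for trend, count in Counter(TREND_DATA).most_common(top_n):
        print(trend, count)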
def fetch_words(url):
    """
    Fetch a list of words from a URL

    Args:
        url: The URL of a UTF-8 text document

    Returns:
        story_words: A list of words
    """
    story = uo(url)
    story_words = []
    for line in story:
        line_words = line.decode('utf8').split()
        for word in line_words:
            story_words.append(word)
    story.close()
    return story_words
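# Usage sketch for fetch_words; the sixty-north URL is the same one used by the
# with-statement variant later in this collection.
story_words = fetch_words('http://sixty-north.com/c/t.txt')
print(story_words[:10])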
def getURL(page):
    global containerAmount
    containerAmount = 0
    for counter in range(page):
        my_url = 'https://store.steampowered.com/search/?specials=1&page=' + str(counter)
        print(counter)
        # open the connection, download the page, and close it
        uPage = uo(my_url)
        my_html = uPage.read()
        uPage.close()
        # html parsing
        global pageParse
        pageParse = soup(my_html, 'html.parser')
        # grab each product container
        global myContainers
        myContainers = pageParse.find_all("div", {'class': 'responsive_search_name_combined'})
        # total amount of games in the current page
        containerAmount += len(myContainers)
        getPNamePriceReview()
        getPlatform()
        getRating()
def tc1(self):
    pg = r(
        techPage.url2,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img src="https://akm-img-a-in.tosshub.com/indiatoday/../sites/all/themes/itg/logo.png?v=1.3" alt="India Today" class="img-fluid">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgs1 = soup.find('div', {'class': 'view-content'}).findAll('div', {'class': 'catagory-listing'})
        for i in imgs1:
            imgs.append(i.find('div', {'class': 'pic'}).find('img').get('src'))
            heads.append(i.find('div', {'class': 'detail'}).find('a').text)
            links.append(i.find('div', {'class': 'detail'}).find('a').get('href'))
            logos.append(logourl)
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        return []
def sc2(self):
    pg = r(
        homePage.url3,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        })
    pg = uo(pg)
    pg_ht = pg.read()
    pg.close()
    soup = sp(pg_ht, 'html.parser')
    logourl = '<img src="https://www.cs.utah.edu/~deb/assets/images/media/logo_it.png" alt="India Today" class="img-fluid">'
    heads = []
    links = []
    imgs = []
    logos = []
    try:
        imgs1 = soup.find('div', {'class': 'view-content'}).findAll('div', {'class': 'catagory-listing'})
        for i in imgs1:
            imgs.append(i.find('div', {'class': 'pic'}).find('img').get('src'))
            heads.append(i.find('div', {'class': 'detail'}).find('a').text)
            links.append(i.find('div', {'class': 'detail'}).find('a').get('href'))
            logos.append(logourl)
        news = list(zip(imgs, heads, links, logos))
        return news
    except Exception:
        return []
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uo

# Get the full webpage for the topic using urllib.
url = input("Enter a link to the topic's Wikipedia page: ")
uclient = uo(url)
page_html = uclient.read()
uclient.close()

# Use Beautiful Soup to parse the html page into the required form.
page_soup = soup(page_html, "html.parser")

# Find all paragraphs in the "bodyContent" div of the "content" div.
datadump = page_soup.body.find("div", {'id': 'content'}).find("div", {'id': 'bodyContent'}).find_all("p")

# Blank out the <sup> reference tags in the Wikipedia page.
for udd in datadump:
    for i in udd.find_all("sup"):
        i.string = ""

# Print the collected info.
for dd in datadump:
    onlyTextDump = dd.get_text()
    print(onlyTextDump)
import sys

# Not recommended: several imports on one line
import sys, os

# Correct style
from subprocess import Popen, PIPE
from urllib.request import *

# Correct style: import only what you need
from urllib.request import urlopen

# Alias an imported module or function
# import numpy as np
from urllib.request import urlopen as uo

uo("http://www.baidu.com")

"""
Common built-in modules
https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/0014319347182373b696e637cc04430b8ee2d548ca1b36d000
"""
"""
Installing third-party modules
https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001432002680493d1babda364904ca0a6e28374498d59a7000
"""

print("---------------------------1-----------------------------")
for x in "huangbo":
    print(x)

ddd = {'x': 'A', 'y': 'B', 'z': 'C'}
# Calling the instance directly returns the list of parsed links
def __call__(self):
    self.feed(self.html)
    return self.data

def handle_starttag(self, tag, attrs):
    if (not self.static_res) and (tag != 'a'):
        return
    prop = ('href', 'src')
    for attr in attrs:
        if attr[0] in prop:
            link = attr[1]
            if link.startswith('mailto:') or link.startswith('javascript:'):
                return  # skip mailto: and javascript: links
            seek = link.find('#')
            if seek != -1:
                link = link[:seek]  # strip the fragment anchor
            if link:  # filter out empty links
                if self.url:
                    link = urljoin(self.url, link)  # resolve against the page URL
                while link.endswith('/'):
                    link = link[:-1]
                self.data.append(link)

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    from urllib.request import urlopen as uo
    parser = AnchorParser(uo(url).read(), url, static_res=True)
    print(parser())
from urllib.request import urlopen as uo
from bs4 import BeautifulSoup as bs

link = 'http://results.vtu.ac.in/results17/result_page.php?usn=1rn14cs'
t = link  # base URL; a three-digit USN suffix is appended per student

# .csv file writer
f = open("Res.csv", "w")
f.write("NAME" + "," + "USN" + "," + "MARKS" + "\n")

# link = link + '078'
for i in range(0, 400):
    link = t + str(i).zfill(3)  # rebuild from the base each iteration
    print(link)
    results_client = uo(link)
    results_html = results_client.read()
    results_client.close()
    r_soup = bs(results_html, "html.parser")
    try:
        usn = r_soup.findAll("div", {"class": "col-md-12"})[3].findAll(
            "td", {"style": "padding-left:15px;text-transform:uppercase"}
        )[0].text.replace(":", "").strip()
        name = r_soup.findAll("div", {"class": "col-md-12"})[3].findAll(
            "td", {"style": "padding-left:15px"}
        )[0].text.replace(":", "").strip()
        marks = r_soup.findAll(
            "table",
            {"style": "margin-left:30px;margin-bottom:5px;font-family:Times New Roman;font-size:12pt;"}
        )[0].findAll("td", {"style": "padding-left:10px"})[0].b.text.replace(":", "").strip()
        f.write(name + "," + usn + "," + marks + "\n")
    except (IndexError, AttributeError):
        # result page missing or malformed for this USN; skip it
        continue
def get_tables(url):
    # parse every <table> on the page into a DataFrame via the to_df helper
    return [to_df(t) for t in bs(uo(url), 'html.parser').find_all('table')]
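# to_df is not defined in this snippet. A minimal sketch of what such a helper
# might look like, assuming pandas is available and tables use <th>/<td> cells;
# this is an illustration, not the original implementation.
import pandas as pd

def to_df(table):
    rows = []
    for tr in table.find_all('tr'):
        cells = [c.get_text(strip=True) for c in tr.find_all(['th', 'td'])]
        if cells:
            rows.append(cells)
    # treat the first row as the header when one exists
    return pd.DataFrame(rows[1:], columns=rows[0]) if rows else pd.DataFrame()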
from urllib.request import Request, urlopen as uo
from bs4 import BeautifulSoup as soup

# make the scraper appear to the website as a legitimate browser
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

url = "http://lottoresults.co.nz/lotto/archive"

# open connection, read the NZ Lotto archive webpage, close connection
client = uo(Request(url, headers=headers))
webpage = client.read()
client.close()

# html parsing
webpage_soup = soup(webpage, "html.parser")

# grab the url for every month in every year and append it to an array
extract_months = webpage_soup.findAll("ul", {"class": ""})
all_months = str(extract_months)
end_urls = []
i = 1
for month in all_months.split('"'):
    if i % 2 == 0:
        end_urls.append(month)
    i += 1
from urllib.request import urlopen as uo

with uo('http://sixty-north.com/c/t.txt') as story:
    words = []
    for line in story:
        # decode each byte line and split it into words
        words.extend(line.decode('utf-8').split())
Start_Row = 1

# Call Ticker Loop
Ticker_Found = loop_through_excel(Start_Row, Total_Rows)

# Create a DataFrame with ticker symbols and names from an Excel spreadsheet.
Ticker_List = 'Ticker_List.xlsx'
Tickers = pd.read_excel(Ticker_List, header=0, index_col=False, keep_default_na=True)

# Search, parse, and locate the market cap on Yahoo Finance.
# html = uo('https://finance.yahoo.com/quote/AAPL/key-statistics?p=AAPL')
# Iterate the ticker-symbol column (assumed to be the first column), not the
# DataFrame itself, which would only yield the column labels.
for i in Tickers.iloc[:, 0]:
    html = uo('https://finance.yahoo.com/quote/' + i + '/key-statistics?p=' + i)
    print(i)
    read = bs(html.read(), 'html.parser')
    MarketCap = read.find('td', {'class': 'Fz(s) Fw(500) Ta(end) Pstart(10px) Miw(60px)'})
    print(MarketCap.get_text())

# Print the top 5 DataFrame rows.
print(Tickers.head())
from bs4 import BeautifulSoup
from urllib.request import Request as R, urlopen as uo

url = "http://synd.cricbuzz.com/j2me/1.0/livematches.xml"
h = {'User-Agent': ''}
req = R(url, headers=h)
resp = uo(req)
xml = BeautifulSoup(resp, "xml")
for val in xml.find("mchdata"):
    print(val)
from urllib.request import urlopen as uo
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'
# my_url = 'https://twitter.com/picoiyer'

uClient = uo(my_url)
page_html = uClient.read()
uClient.close()

# parsing html here
page_soup = soup(page_html, 'html.parser')

# grab each product listing on the page
containers = page_soup.findAll("div", {"class": "item-container"})
# print(page_soup.p, page_soup.h1)
# print(containers[0])

for container in containers:
    # print(container)
    brand = container.div.div.a.img["title"]
    title_container = container.findAll('a', {"class": "item-title"})
    # print(title_container)
    product_name = title_container[0].text
    shipping = container.findAll('li', {"class": "price-ship"})[0].text.strip()
    print("Brand: {}".format(brand))
    print("Product name: {}".format(product_name))
    print("Shipping: {}".format(shipping))