def search_wxc_cooking(url):
    global pages_wxc
    soup = get_soup_obj_ff('http://bbs.wenxuecity.com' + url)
    # print('ref stop=' + 'http://bbs.wenxuecity.com' + url)
    foodLinks = soup.find_all("a", href=re.compile("^(./1)"))
    for foodLink in foodLinks:
        if 'title' in foodLink.attrs:
            # print(foodLink.attrs['title'])
            # print(foodLink.text)
            if "四川" in foodLink.attrs['title']:
                print(foodLink.attrs['title'])
                print('Author=' + foodLink.parent.find("a", {"class": "b"}).text)
                # print('Author=' + foodLink.parent.find("a", id=re.compile("^(n_cooking_)")).__str__())
                # for child in foodLink.parent.find(""):
                #     if 'text' in child.attrs:
                #         print(child.__str__())
    pageLinks = soup.find_all("a", href=re.compile(r"^(/cooking/\?page)"))
    for pageLink in pageLinks:
        # print(pageLink.attrs['href'])
        if pageLink.attrs['href'] not in pages_wxc:
            pages_wxc.add(pageLink.attrs['href'])
            print('New page found = ' + pageLink.attrs['href'])
            search_wxc_cooking(pageLink.attrs['href'])
def openHRef(url, urls, result):
    if (url in urls):
        return
    urls.append(url)
    bsObj = urlToBeautifulSoup(url)
    container = bsObj.find('div', {'id': 'mw-pages'})
    nextPage = container.find('a', {'href': re.compile("/w/.*#mw-pages")})
    anchors = container.findAll('a', {'href': re.compile("/wiki/.*")})
    for a in anchors:
        if a.has_attr('title') and a['title'] not in result:
            wikilink = '{}{}'.format(wikipedia, a['href'])
            bsWikilink = urlToBeautifulSoup(wikilink)
            wikidataUrl = bsWikilink.find(
                'a', {'href': re.compile('https://www.wikidata.org/wiki/Q')})
            if (wikidataUrl is None):
                continue
            wikidataHref = wikidataUrl['href']
            bsWikidata = urlToBeautifulSoup(wikidataHref)
            data = getData(bsWikidata)
            qid = 'Q{}'.format(wikidataHref.split('Q')[1])
            result[a['title']] = {
                'qid': qid,
                'title': a['title'],
                'href': wikilink,
                'data': data
            }
    if nextPage is not None:
        openHRef('https://es.wikipedia.org{}'.format(nextPage['href']), urls, result)
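# A minimal sketch of the urlToBeautifulSoup() helper that openHRef(), getData() and main()
# rely on; the requests + html.parser combination and the `wikipedia` base URL below are
# assumptions, since the original helper and globals are not shown in these snippets.
import requests
from bs4 import BeautifulSoup

wikipedia = 'https://es.wikipedia.org'  # assumed base URL used to build full article links

def urlToBeautifulSoup(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')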
def crawl_rating(URL):
    movies_list = crawl_moveek(URL)
    for i in range(len(movies_list)):
        movie = movies_list[i]
        soup = get_URL("https://www.google.com/search?q=" + movie["title"])
        try:
            # strip() peels the leading "/url?q=" characters off Google's redirect link,
            # and split("&") drops the tracking parameters that follow the real URL
            movie["imdb_URL"] = soup.find(
                href=re.compile("imdb"))["href"].strip("/url?q=").split("&")[0]
            movie["rotten_URL"] = soup.find(
                href=re.compile("rotten"))["href"].strip("/url?q=").split("&")[0]
        except:
            pass
    return movies_list
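# A minimal sketch of the get_URL() helper used by the crawl_* functions; a plain requests
# GET with a desktop User-Agent is an assumption, since the original helper is not included
# in these snippets.
import requests
from bs4 import BeautifulSoup

def get_URL(url):
    headers = {"User-Agent": "Mozilla/5.0"}  # assumed header to avoid trivial bot blocking
    response = requests.get(url, headers=headers)
    return BeautifulSoup(response.text, "html.parser")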
def test_reg_expression():
    url = 'http://www.pythonscraping.com/pages/page3.html'
    soup = get_soup_obj(url)
    # search <img src="../img/gifts/img3.jpg">
    images = soup.find_all("img", {"src": re.compile(r"\.\./img/gifts/img.\.jpg")})
    for image in images:
        print(image["src"])
    gifts = soup.find_all("tr", {"id": re.compile("gift.")})
    for gift in gifts:
        print(gift["id"])
    # test lambda expressions:
    # retrieve all tags that have exactly two attributes
    tags = soup.find_all(lambda tag: len(tag.attrs) == 2)
    for tag in tags:
        print(tag.attrs)
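# A minimal sketch of the get_soup_obj() helper assumed by test_reg_expression() and
# get_links(); it simply fetches the URL and wraps the markup in BeautifulSoup.
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_soup_obj(url):
    return BeautifulSoup(urlopen(url), "html.parser")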
def getData(bs):
    statementHeader = bs.find(id='claims').parent
    divContainer = statementHeader.find_next_sibling()
    divData = divContainer.findAll('div', {'id': re.compile('P[0-9]*')})
    properties = {}
    for data in divData:
        aProperty = data.find('a', {'title': re.compile('Property:P[0-9]*')})
        propertyId = aProperty['title'].split(':')[1]
        propertyName = aProperty.get_text()
        if 'image' in propertyName:
            aValue = [data.find('img')]
        else:
            aValue = data.findAll(
                'div',
                {'class': 'wikibase-snakview-value wikibase-snakview-variation-valuesnak'})
        values = {}
        for a in aValue:
            if (a.name == 'img'):
                qValue = 'img'
                textValue = 'https:{}'.format(a['src'])
            else:  # this is a div
                if len(a.find_all()) > 0:
                    innerDataTitle = a.find_all('a', {'title': re.compile('Q[0-9]*')})
                    for idata in innerDataTitle:
                        if 'ikipedia' in idata.get_text():
                            continue
                        qValue = idata['title']
                        textValue = idata.get_text()
                    innerDataLink = a.find_all('a', {'class': re.compile('external free')})
                    for idata in innerDataLink:
                        if 'ikipedia' in idata.get_text():
                            continue
                        qValue = 'link'
                        textValue = idata.get_text()
                else:
                    qValue = 'value'
                    textValue = a.get_text()
            values[qValue] = textValue
        properties[propertyId] = {}
        properties[propertyId]['label'] = propertyName
        properties[propertyId]['values'] = values
    return properties
def verify(url_add, writer):
    current_link = url_add
    next_link = ""
    last_link = ""
    while True:
        html = urlopen(current_link)
        bsObj = BeautifulSoup(html, "html.parser")
        pager_tags = bsObj.findAll("div", {"class": "Pager"})
        for pages_tag in pager_tags:
            if pages_tag:
                page_tags = pages_tag.findAll(
                    "a",
                    {"href": re.compile(r"\/Shop-[a-zA-Z]{6}\/[0-9]*\/[a-zA-Z0-9]*\?page=[0-9]*")}
                )
                if page_tags:
                    next_page_tag = page_tags[-2]
                    last_page_tag = page_tags[-1]
                    next_link = "http://www.chemistwarehouse.com.au" + next_page_tag["href"]
                    last_link = "http://www.chemistwarehouse.com.au" + last_page_tag["href"]
        save_single_page(current_link, writer)
        if next_link == last_link:
            save_single_page(next_link, writer)
            break
        else:
            current_link = next_link
            next_link = ""
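# A hypothetical driver for verify(), assuming the save_single_page() helper writes one row
# per product through the csv writer; the category URL is the Swisse page that other snippets
# in this collection also use.
import csv

with open("products.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "price"])
    verify("http://www.chemistwarehouse.com.au/Shop-Online/587/Swisse", writer)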
def searchCodeAThon():
    searches = []
    page_start = raw_input("Enter start page: ")
    page_end = raw_input("Enter end page: ")
    num_users = raw_input("Enter number of searches: ")
    for num in range(0, int(num_users)):
        search_term = raw_input("Please enter search term: ")
        searches.append(search_term)
    for search in searches:
        for page_num in range(int(page_start), int(page_end)):
            page = requests.get(
                'http://www.tynker.com/tools/community?t=codeathon&v=published&s=' + str(page_num),
                auth=(username, pswd))
            soup = BeautifulSoup(page.text, 'html.parser')
            projects = soup.find_all('li', class_="card")
            for proj in projects:
                if str(search) in proj.text:
                    print '\n'
                    print "Page: " + str(page_num + 1)
                    link = proj.find('a', href=re.compile('community-details'))
                    print 'http://www.tynker.com/tools/' + str(link['href'])
                    proj_name = proj.find('div', class_="card-title")
                    print proj_name.get_text()
                    print '\n'
def getExternalLinks(bsObj, includeUrl):
    externalLinks = []
    # findAll (not find) so that we iterate over every matching <a> tag
    for links in bsObj.findAll("a", href=re.compile("^(www|http)((?!" + includeUrl + ").)*$")):
        if links.attrs['href'] is not None:
            if links.attrs['href'] not in externalLinks:
                externalLinks.append(links.attrs['href'])
    return externalLinks
def _podcastParser(self, html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    episodes = soup.find_all('div', attrs={'class': re.compile('episode ')})
    self.__dict__.clear()
    for x, episode in enumerate(episodes):
        episode_num = episode.find('span', attrs={'class': 'episode-num'}).get_text().strip('#')
        podcast_date = episode.find('h3').get_text()
        desc = episode.find('div', attrs={'class': 'podcast-content'}).get_text()
        title = episode.find('div', attrs={'class': 'podcast-details'})
        title = title.find('h3').get_text()
        podcast_mp3 = '%s%s.mp3' % (self.mp3_url,
                                    'mmashow' + episode_num if 'mma' in title.lower() else 'p' + episode_num)
        dllinks = episode.find('ul', attrs={'class': 'download-links'})
        vimeo_link = dllinks.find('a')['href']
        self.__dict__[x] = {
            'episode': episode_num,
            'title': title,
            'date': podcast_date,
            'desc': desc.encode(),
            'mp3_url': podcast_mp3,
            'vimeo_url': vimeo_link
        }
    return self.__dict__
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # findAll (not find) so that we iterate over every matching <a> tag
    for links in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if links.attrs['href'] is not None:
            if links.attrs['href'] not in internalLinks:
                internalLinks.append(links.attrs['href'])
    return internalLinks
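# A small usage sketch for getInternalLinks()/getExternalLinks(), assuming the usual
# urlopen + BeautifulSoup setup; the starting URL is illustrative only.
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup

startingPage = "http://oreilly.com"  # illustrative starting point
bsObj = BeautifulSoup(urlopen(startingPage), "html.parser")
site = urlparse(startingPage).netloc
print(getInternalLinks(bsObj, site))
print(getExternalLinks(bsObj, site))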
def parse(self):
    soup = BeautifulSoup(self.source, 'html.parser')
    self.name = soup.find('h2').string
    raw_price = str(soup.find('h3'))
    loc = raw_price.index('$')
    self.price = raw_price[loc:loc + 7]
    self.distance = soup.find('span', text=re.compile(r'mi\)')).string
def f(item):
    title, url, lang = item
    page = spider.session.get(url)
    pat = re.compile("scope.code.{} = '(.+)'".format(lang))
    code = pat.findall(page.text)[0]
    jsoncode = json.loads('{"code": "%s"}' % code)
    codepath = set_save_path(title, lang)
    self.print_to_file(jsoncode['code'], codepath)
def executor(item):
    title, url, lang = item
    page = spider.session.get(url)
    pat = re.compile("scope.code.{} = '(.+)'".format(lang))
    code = pat.findall(page.text)[0]
    jsoncode = json.loads('{"code": "%s"}' % code)
    codepath = set_save_path(title, lang)
    self.print_to_file(jsoncode['code'], codepath)
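# A hypothetical sketch of the set_save_path() helper that f()/executor() call; the
# one-folder-per-problem layout and extension map are assumptions, not the original code.
import os

def set_save_path(title, lang, root="solutions"):
    ext = {"python": "py", "cpp": "cpp", "java": "java"}.get(lang, "txt")
    pdir = os.path.join(root, title)
    if not os.path.isdir(pdir):
        os.makedirs(pdir)
    return os.path.join(pdir, "solution." + ext)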
def print_info(url_address):
    html = urlopen(url_address)
    bsObj = BeautifulSoup(html, "html.parser")
    a_tags = bsObj.findAll("a", {"href": re.compile(r"\/buy\/[0-9]*\/[a-zA-Z0-9\-]*")})
    for a in a_tags:
        print(a["title"])
        price_tags = a.findAll("span", {"class": "Price"})
        for price in price_tags:
            if price.get_text():
                print(price.get_text().strip(" \t\n\r"))
def scrape_links(url, soup):
    first = soup.find('a', attrs={'href': re.compile("/")})
    try:
        l1 = first.get('href')
        a = [first]
        links = [l1]
        n = 0
        length = len(soup.findAll('a', attrs={'href': re.compile("/")}))
        while n < (length - 1):
            # extract the relevant <a> tags and collect them in a list
            blah = a[n].find_next('a', attrs={'href': re.compile("/")})
            a.append(blah)
            # extract the href from each tag and resolve it against the base URL
            link = blah.get('href')
            clean_link = parse.urljoin(url, link)
            links.append(clean_link)
            n += 1
        return links
    except:
        print("\nThis link is not scrapable!")
def getbirthchart(month, day, year, hour, minute, ampm, town, country, state=None):
    url = 'http://alabe.com/cgi-bin/chart/astrobot.cgi?INPUT1=&INPUT2=&MONTH=%d&DAY=%d&YEAR=%d&HOUR=%d&MINUTE=%d&M=%s&TOWN=%s&COUNTRY=%s&STATE=%s&INPUT9=&Submit=Submit' % (month, day, year, hour, minute, ampm, town, country, state)
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    chart = soup.find_all(string=re.compile("Degree"))
    chartlist = []
    for line in chart:
        planet = line.split(None, 1)[0]
        sign = line.rsplit(None, 1)[-1]
        chartlist.append((planet, sign))
    print chartlist
def get_table(self, url):
    soup = self.get_soup(url)
    if soup.find(text=re.compile('available')):
        raise PageError('No Such Page', url)
    if '/tag/' in url:
        pat = re.compile(
            r'"id".+?"(\d+)".+?"title".+?"(.+?)".+?"ac_rate".+?"(.+?)".+?"difficulty".+?"(.+?)"',
            re.S | re.X | re.U)
        raw_script = soup.body.find_all('script')[3].text
        table = []
        for data in pat.findall(raw_script):
            num, title, ac_rate, diff = data
            title, diff = BeautifulSoup(title), BeautifulSoup(diff)
            table.append(
                (num, title.text, ac_rate, diff.text, title.a['href']))
    else:
        t = soup.find(id='problemList').find_all('tr')[1:]
        table = [tuple(i.stripped_strings) + (i.a['href'],) for i in t]
    return table
def get_table(self, url):
    soup = self.get_soup(url)
    if soup.find(text=re.compile('available')):
        raise PageError('No Such Page', url)
    if '/tag/' in url:
        pat = re.compile(r'"id".+?"(\d+)".+?'
                         r'"title".+?"(.+?)".+?'
                         r'"ac_rate".+?"(.+?)".+?'
                         r'"difficulty".+?"(.+?)"',
                         re.S | re.X | re.U)
        raw_script = soup.body.find_all('script')[3].text
        table = []
        for data in pat.findall(raw_script):
            num, title, ac_rate, diff = data
            title, diff = BeautifulSoup(title), BeautifulSoup(diff)
            table.append((num, title.text, ac_rate, diff.text, title.a['href']))
    else:
        tmp = soup.find(id='problemList').find_all('tr')[1:]
        table = [tuple(i.stripped_strings) + (i.a['href'],) for i in tmp]
    return table
def crawl_moveek(URL):
    soup = get_URL(URL)
    movies = soup.find_all(href=re.compile("/phim/"))
    movies_list = list()
    for movie in movies:
        _movie = {}
        if movie.img:
            _movie["title"] = movie["title"]
            _movie["link"] = movie["href"]
            _movie["img"] = movie.img["data-src"]
            movies_list.append(_movie)
    return movies_list
def get_links(url):
    global pages, pages_mit
    soup = get_soup_obj('http://www.pythonscraping.com/' + url)
    # search href values; a leading '/' means an internal link
    # links = soup.find_all("a", {'href': re.compile("^(/node/|/blog|/)")})
    links = soup.find_all("a", {'href': re.compile("^(/)")})
    for link in links:
        # print(link.attrs['href'])
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                pages.add(newPage)
                print(newPage)
                get_links(newPage)
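# A minimal driver sketch for get_links(); empty global page sets and an empty starting
# path are assumptions, mirroring how the other recursive crawlers in this collection
# are kicked off.
pages = set()
pages_mit = set()
get_links("")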
def verify(url_add): current_link = url_add next_link = "" last_link = "" html = urlopen(current_link) bsObj = BeautifulSoup(html,"html.parser") pager_tags = bsObj.findAll("div",{"class":"Pager"}) pages_tag = pager_tags[0] print(pages_tag) if(pages_tag): page_tags = pages_tag.findAll("a",{"href":re.compile("\/Shop-[a-zA-Z]{6}\/[0-9]*\/[a-zA-Z0-9]*\?page=[0-9]*")}) if(page_tags): last_page_tag = page_tags[-1] last_link = "http://www.chemistwarehouse.com.au" + last_page_tag["href"] print(last_link) while True: html = urlopen(current_link) bsObj = BeautifulSoup(html,"html.parser") pager_tags = bsObj.findAll("div",{"class":"Pager"}) for pages_tag in pager_tags: if(pages_tag): page_tags = pages_tag.findAll("a",{"href":re.compile("\/Shop-[a-zA-Z]{6}\/[0-9]*\/[a-zA-Z0-9]*\?page=[0-9]*")}) if(page_tags): next_page_tag = page_tags[-2] next_link = "http://www.chemistwarehouse.com.au" + next_page_tag["href"] print(next_link) print_info(current_link) if(next_link == last_link): print_info(next_link) break else: current_link = next_link next_link = ""
def save_defaultcode(soup, pdir, langlist):
    tag = soup.find(lambda x: x.has_attr('ng-init'))
    rawjson = tag['ng-init']
    pat = re.compile(r'(\[.+\])')
    raw = pat.findall(rawjson)[0].replace("'", '"')  # ' -> "
    raw = ''.join(raw.rsplit(',', 1))  # remove the last ',' in the json list
    codelist = json.loads(raw)
    codelist = filter(lambda x: x['value'] in langlist, codelist)
    codedict = {i['value']: i['defaultCode'] for i in codelist}
    for lang in codedict.keys():
        codepath = os.path.join(pdir, self.SAVENAME[lang])
        if not os.path.isfile(codepath):
            self.print_to_file(codedict[lang], codepath)
        elif self.DEBUG:
            print('{} already exists!'.format(codepath))
def save_defaultcode(soup, pdir, langlist):
    tag = soup.find(lambda x: x.has_attr('ng-init'))
    rawjson = tag['ng-init']
    pat = re.compile(r'(\[.+\])')
    raw = pat.findall(rawjson)[0].replace("'", '"')  # ' -> "
    raw = ''.join(raw.rsplit(',', 1))  # remove the last ',' in the json list
    codelist = json.loads(raw)
    codelist = filter(lambda x: x['value'] in langlist, codelist)
    d = {i['value']: i['defaultCode'] for i in codelist}
    for lang in d.keys():
        codepath = os.path.join(pdir, self.SAVENAME[lang])
        if not os.path.isfile(codepath):
            self.print_to_file(d[lang], codepath)
        elif self.DEBUG:
            print('{} already exists!'.format(codepath))
def crawl_rating_moveek(URL):
    movies_list = crawl_moveek(URL)
    for i in range(len(movies_list)):
        movie = movies_list[i]
        soup = get_URL("https://moveek.com" + movie["link"])
        movie["gerne"] = soup.find(
            class_="mb-0 text-muted text-truncate").string.strip().strip("-").strip()
        try:
            movie["description"] = soup.find(class_="mb-3 text-justify").text
        except:
            if "description" not in movie:
                soup = get_URL("https://moveek.com/" + movie["link"].strip("/en"))
                movie["description"] = soup.find(class_="mb-3 text-justify").text
        movie["rating"] = soup.find(href=re.compile("/review/")).text.strip()
        if movie["rating"] == "Reviews" or movie["rating"] == "Đánh giá":
            movie["rating"] = "No Review"
    return movies_list
def get_zones_dict(df):
    selected = df[columns]
    temp_df = df["gate_arrival_actual_timezone_code"]
    for i in range(1, len(columns)):
        temp_df = temp_df.append(selected[columns[i]], ignore_index=True)
    zones = [z.lower() for z in temp_df.unique() if "GMT" not in z]
    zones_dict = {}
    for zone in zones:
        url = 'https://www.timeanddate.com/time/zones/' + zone
        page = rq.get(url)
        content = page.content
        soup = BeautifulSoup(content, 'html.parser')
        scraped_zone = soup.find_all("ul", {"class": "clear"})
        if len(scraped_zone) > 0:
            p = re.compile(r'UTC [+-][0-9]{1,2}\b')
            search = p.search(scraped_zone[0].text)
            group = search.group(0)
            result = re.sub(r'[\s]', '', group)
            zones_dict[zone] = result.replace("UTC", "Etc/GMT")
    return zones_dict
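# A hypothetical usage sketch for get_zones_dict(); the module-level `columns` list and the
# `rq` alias it relies on are not shown in the original, so the column names and input file
# below are illustrative only.
import pandas as pd
import requests as rq  # assumed alias used as rq.get() above

columns = ["gate_arrival_actual_timezone_code", "gate_departure_actual_timezone_code"]
df = pd.read_csv("flights.csv")  # assumed input file
print(get_zones_dict(df))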
def scrape_game_data(url):
    # start browser
    browser = init_browser()
    # navigate to website
    browser.visit(url)
    # prepare soup for scraping
    html = browser.html
    soup = bs(html, 'html.parser')
    # find html text that contains the title of the table we want to scrape from
    target_title = soup.find_all(text=re.compile('Top Live'))
    # find the header that has our target title and extract the text
    target_h4 = soup.find('h4', text=target_title)
    table_title = target_h4.get_text()
    # find tables that are in the same parent tag
    tables = target_h4.parent.findAll('table')
    # extract data from the tables into lists
    streamers = []
    viewers = []
    for column in tables:
        streamer_list = column.find_all('a')
        for streamer in streamer_list:
            streamer_name = streamer.get_text()
            streamers.append(streamer_name)
        viewers_list = column.find_all('span')
        for streamer in viewers_list:
            viewership = streamer.get_text()
            viewers.append(viewership)
    viewers = [int(viewer.replace(',', '')) for viewer in viewers]
    # close browser
    browser.quit()
    # create dictionary for use in dataframe using the lists created
    data_dict = {'Channel': streamers, 'Viewers': viewers}
    # create dataframe using the dictionary created
    mytable = pd.DataFrame(data_dict)
    # set the channel as the index
    mytable.set_index('Channel', inplace=True)
    return mytable
def getLink(link):
    # This function returns the sub-links found on a page.
    # Fix (7/12/2015): the scraped links are not the final addresses. For example, a scraped
    # link like http://www.chnaus.com/thread-230167-1-1.html, when opened in a browser,
    # actually becomes http://www.chnaus.com/thread-230167-1-1.html?_dsign=15b50450.
    # The trailing token is generated by JavaScript, so fetching the plain .html URL with
    # BeautifulSoup only returns the JavaScript stub.
    return_link = []
    try:
        html = urlopen(link)
        bsObj = BeautifulSoup(html, 'html.parser')
        # Fix (07/12/2015): thread\-[0-9]{6}\-[0-9]\-[0-9]\.html ----> thread\-[0-9]{4,6}\-[0-9]\-[0-9]+\.html
        # The page number is one digit for pages 1-9 and two digits for 10-99, hence the trailing '+'.
        # Also, after crawling several hundred pages the first number group can be 5 digits
        # instead of 6, hence {6} ---> {4,6}.
        a_tags = bsObj.findAll('a', {'class': True, 'onclick': True,
                                     'href': re.compile(r'http:\/\/www\.chnaus\.com\/thread\-[0-9]{4,6}\-[0-9]\-[0-9]+\.html')})
        if a_tags:
            for a_tag in a_tags:
                return_link.append(a_tag['href'])
    finally:
        return return_link
def verify(url_add): current_link = url_add next_link = "" last_link = "" while True: html = urlopen(current_link) bsObj = BeautifulSoup(html,"html.parser") pager_tag = bsObj.findAll("div",{"class":"Pager"}) pages_tag = pager_tag.findAll("a",{"href":re.compile("\/Shop-Online\/[0-9]*\/\?page=[0-9]*")}) next_page_tag = pages_tag[-2] last_page_tag = pages_tag[-1] next_link = "http://www.chemistwarehouse.com.au" + next_page_tag["href"] last_link = "http://www.chemistwarehouse.com.au" + last_page_tag["href"] print(current_link) if(next_link == last_link): print(next_link) break else: current_link = next_link next_link = ""
def getLinks(pageUrl):
    global pages  # make sure the global name is used
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # test the structure of the page
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("Cannot find the main text or edit page")
    for links in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in links.attrs:
            if links.attrs['href'] not in pages:
                newPage = links.attrs['href']
                pages.add(newPage)
                print(newPage)
                getLinks(newPage)
        else:
            print('How is it possible?')
            print(links)
            print('How is it possible?')
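# A minimal driver sketch for getLinks(): an empty global page set and the site root
# (an empty relative path) are assumed as the starting point.
pages = set()
getLinks("")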
def get_pic_request(self):
    print('Start the GET')
    r = self.requests(self.web_url)
    print('Start to find all the <img>')
    text = BeautifulSoup(r.text, 'lxml')
    all_images = text.find_all('img', alt=re.compile("keywords"))  # find all <img> tags whose alt matches "keywords"
    print('create file')
    self.create_folder(self.folder_path)
    print('change the current file to it')
    os.chdir(self.folder_path)  # change the working directory to the target folder
    i = 0
    all_pics = self.get_files(self.folder_path)
    for img in all_images:
        src = img['src']  # index into the src attribute, not the Tag object itself
        name_start_pos = src.index('photo')
        name_end_pos = src.index('?')
        name = src[name_start_pos:name_end_pos] + '.jpg'
        if name in all_pics:
            print("this pic already exists")
            continue
        else:
            print(img)
            self.save_img(src, name)
            i += 1
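# A hypothetical sketch of the save_img() method used above; the original class method is
# not shown, so a plain requests download into the current working directory is assumed.
import requests

def save_img(self, img_url, file_name):
    response = requests.get(img_url, timeout=30)
    with open(file_name, "wb") as f:
        f.write(response.content)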
from urllib.request import urlopen from bs4 import BeautifulSoup from bs4 import re html = urlopen("http://www.chemistwarehouse.com.au/Shop-Online/587/Swisse") bsObj = BeautifulSoup(html,"html.parser") #print(bsObj.prettify()) #nameList = bsObj.findAll("span",{"class":"Price"}) a_tags = bsObj.findAll("a",{"href":re.compile("\/buy\/[0-9]*\/[a-zA-Z0-9\-]*")}) for a in a_tags: print(a["title"]) price_tags = a.findAll("span",{"class":"Price"}) for price in price_tags: if(price.get_text()): print(price.get_text().strip(" \t\n\r"))
from bs4 import BeautifulSoup
import re

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html)
'''
#print soup.prettify()
print soup.a
print type(soup.a)
print soup.p.string
print soup.attrs
print soup.body.contents
for child in soup.body.children:
    print child
for child in soup.descendants:
    print child
'''
print soup.find_all(href=re.compile("elsie"), id='link1')
from urllib.request import urlopen from bs4 import BeautifulSoup from bs4 import re html = urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon") bsObj = BeautifulSoup(html,"html.parser") for link in bsObj.find("div",{"id":"bodyContent"}).findAll("a",href = re.compile("^(/wiki/)((?!:).)*$")): if "href" in link.attrs: print(link["href"])
from urllib.request import urlopen from bs4 import BeautifulSoup from bs4 import re html = urlopen("http://www.pythonscraping.com/pages/page3.html") bsObj = BeautifulSoup(html) images = bsObj.findAll("img", {"src": re.compile("\.\./img/gifts/img.*\.jpg")}) for image in images: print(image["src"])
def scrape_streamer_data(url):
    # start browser
    browser = init_browser()
    # navigate to website
    browser.visit(url)
    # click on the Month tab in the performance section
    browser.find_by_text('Month').click()
    # prepare soup for scraping
    html = browser.html
    soup = bs(html, 'html.parser')
    # create empty dictionary to store the data we scrape
    data_dict = {}
    # scrape for channel_name
    channel_name = soup.find('div', id='mini-profile').find('h4').get_text()
    data_dict['channel_name'] = channel_name
    # scrape for streamer's avg_viewers
    target_text = soup.find_all(text=re.compile('Avg viewers'))
    target_div = soup.find('div', text=target_text)
    avg_viewers = float(target_div.parent.find_all('span')[1].get_text().replace(',', ''))
    data_dict['avg_viewers'] = avg_viewers
    # scrape for streamer's time_streamed
    target_text = soup.find_all(text=re.compile('Hours streamed'))
    target_div = soup.find('div', text=target_text)
    time_streamed = float(target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['time_streamed(hrs)'] = time_streamed
    # scrape for streamer's all_time_peak_viewers
    target_text = soup.find_all(text=re.compile('Highest recorded number of concur. viewers'))
    target_div = soup.find('div', text=target_text)
    all_time_peak_viewers = float(target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['all_time_peak_viewers'] = all_time_peak_viewers
    # scrape for streamer's hours_watched
    target_text = soup.find_all(text=re.compile('Hours watched'))
    target_div = soup.find('div', text=target_text)
    hours_watched = float(target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['hours_watched'] = hours_watched
    # scrape for streamer's overall_rank
    target_text = soup.find_all(text=re.compile('RANK'))
    target_div = soup.find('span', text=target_text)
    overall_rank = float(target_div.parent.find_all('span')[1].get_text().replace(',', ''))
    data_dict['overall_rank'] = overall_rank
    # scrape for streamer's followers_gained
    target_text = soup.find_all(text=re.compile('Followers gained'))
    target_div = soup.find('div', text=target_text)
    followers_gained = float(target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['followers_gained'] = followers_gained
    # scrape for streamer's total_followers
    target_text = soup.find_all(text=re.compile('Total followers'))
    target_div = soup.find('div', text=target_text)
    total_followers = float(target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['total_followers'] = total_followers
    # scrape for streamer's total_views
    target_text = soup.find_all(text=re.compile('Total views'))
    target_div = soup.find('div', text=target_text)
    total_views = float(target_div.parent.find_all('div')[2].get_text().replace(',', ''))
    data_dict['total_views'] = total_views
    # close browser
    browser.quit()
    # create dataframe using the dictionary created
    mytable = pd.DataFrame([data_dict])
    # set the channel as the index
    mytable.set_index('channel_name', inplace=True)
    return mytable
DB = 'geipan'
mongo = MongoClient('mongodb://' + HOST)
db = mongo[DB]
cursor = db['Cas'].find({}, no_cursor_timeout=True).batch_size(20)
idCasFail = list()
for _cas in cursor:
    res = requests.get('http://www.cnes-geipan.fr/index.php?id=202&cas=' + _cas['cas_numEtude'])
    if res.status_code == 200:
        resContent = res.content
        soup = BeautifulSoup(resContent, 'html.parser')
        try:
            # links = soup.find("div", {"class": "tx-geipansearch-pi1"}).find_all('a')
            newObject = _cas
            links = soup.find_all(href=re.compile("geipan-doc"))
            newObject['files'] = [{"name": _link.text, "link": _link.get('href')} for _link in links]
            print(str(_cas['_id']) + ' updated')
            db['Cas'].update_one({'_id': _cas['_id']}, {'$set': newObject})
        except:
            print("file not found -> append list")
            idCasFail.append(str(_cas['_id']))
    else:
        print("request fail -> append list")
        idCasFail.append(str(_cas['_id']))
cursor.close()
print(idCasFail)
directory = "downloaded" if not os.path.exists(directory): os.makedirs(directory) course=input(" enter course you want to search") html = urlopen("https://www.coursera.org/courses?query="+course) print(course) b=BeautifulSoup(html,'html.parser') col=b.findAll("div",{"class":"card-info"}) cno=0 for i in col: course_name=i.find("",{"class":re.compile("\.*card-title\.*")}).get_text() print ("name:",course_name) # cno=cno+ 1 link="https://www.coursera.org"+i.find_parent("a" ).attrs['href'] tag=i.find("",{"class":re.compile("\.*product-badge\.*")}).get_text() print ("badge:",tag) course_by=i.find("",{"class":re.compile("\.*card-description\.*")}).get_text() print ("by:",course_by) print ("link foR MORE details :",link) if tag.lower() == "course": newx=BeautifulSoup(urlopen(link),'html.parser') free=-1 duration=-1 cost=-1
__author__ = 'charlesw'

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, "html.parser")
nameList = bsObj.findAll("span", {"class": "green"})
# print(bsObj.findAll(id="text"))  # this skips the name input, just checking on attr input
for name in nameList:
    print(name.get_text())

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)

print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

for i in bsObj.find("table", {"id": "giftList"}).findAll("img"):
    print(i.parent.previous_sibling.previous_sibling.previous_sibling.get_text()
          + ' -- ' + i.parent.previous_sibling.get_text())

images = bsObj.findAll("img", {"src": re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
    print(image.parent.previous_sibling.previous_sibling.previous_sibling.get_text()
          + ' -- ' + image.parent.previous_sibling.get_text())
def main(argv):
    url = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hu:o:", ["url=", "ofile="])
    except getopt.GetoptError:
        print('wikiannex.py -u <url> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if (opt == '-h'):
            print('wikiannex.py -u <url> -o <outputfile>')
            sys.exit()
        elif opt in ("-u", "--url"):
            url = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        else:
            assert False, "unhandled option"
    bsObj = urlToBeautifulSoup(url)
    ids = bsObj.findAll('span', {'class': 'mw-headline'})
    elements = {}
    for id in ids:
        element = {}
        name = id.get_text()
        element['qid'] = ''
        element['title'] = name
        element['href'] = ''
        element['data'] = {}
        if (name == 'Referencias'):
            break
        parent = id.parent
        article = parent.find_next_sibling()
        if (article is not None and article.name == 'div'):
            articleHref = article.find('i').find('a')['href']
            wikilink = '{}{}'.format(wikipedia, articleHref)
            element['href'] = articleHref
        nextNode = parent.find_next_sibling()
        while True:
            if (nextNode.name == 'ul'):
                break
            elif (nextNode.name == 'h2'):
                break
            nextNode = nextNode.find_next_sibling()
            continue
        # if (nextNode.name == 'h2'):
        #     nextNode = parent.find_next_sibling()
        #     element['media'] = {}
        #     counter = 0
        #     while True:
        #         if (nextNode.name == 'p'):
        #             element['media']["p{}".format(counter)] = nextNode.get_text()
        #             counter = counter + 1
        #         elif (nextNode.name == 'h2'):
        #             break
        #         nextNode = nextNode.find_next_sibling()
        #         continue
        # else:
        #     li = nextNode.findAll('li')
        #     actives = {}
        #     for l in li:
        #         text = (l.get_text())
        #         if ':' in text:
        #             category, other = text.split(':', 1)
        #             actives[category.strip()] = other
        #     element['media'] = actives
        href = element.get('href', False)
        if href:
            bsWikilink = urlToBeautifulSoup(wikilink)
            wikidataUrl = bsWikilink.find(
                'a', {'href': re.compile('https://www.wikidata.org/wiki/Q')})
            if (wikidataUrl is None):
                continue
            wikidataHref = wikidataUrl['href']
            bsWikidata = urlToBeautifulSoup(wikidataHref)
            data = getData(bsWikidata)
            qid = 'Q{}'.format(wikidataHref.split('Q')[1])
            element['qid'] = qid
            element['data'] = data
        elements[element['title']] = element
    print(elements)
outpic.write(imageData)
if os.stat(ImageUrl.split("/")[-1]).st_size > 0:
    print("saved image:", ImageUrl.split("/")[-1])
elif retry_count < 3:
    print(ImageUrl.split("/")[-1], "size is 0. imageData size:", len(imageData), ". Retrying...")
    sleepCountdown(2, 1)
    savePic(ImageUrl, retry_count=retry_count + 1)
else:
    print(ImageUrl.split("/")[-1], "failed. Giving up.")


url = "https://www.onemotoring.com.sg/content/onemotoring/home/driving/traffic_information/traffic-cameras/woodlands.html"
Cameras = {'4703': 'Tuas Second Link',
           '4713': 'Tuas Checkpoint',
           '2701': 'Woodlands Causeway Towards Johor',
           '2702': 'Woodlands Checkpoint (Towards BKE)'}
time_interval = 55  # in seconds

while True:
    print("getting data...")
    r = requests.get(url=url)
    images = BeautifulSoup(r.content, "lxml").find_all(src=re.compile("mytransport"))  # get the image links
    print("Image links obtained. Length", len(images))
    for image in images:
        savePic("http:" + image["src"])  # save the image
    print("Waiting for", time_interval, "seconds... Press Ctrl+C to stop.")
    sleepCountdown(time_interval, 2)
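# A hypothetical sketch of the sleepCountdown() helper used above; the original is not shown,
# so it is assumed to sleep for `seconds` while printing a countdown every `step` seconds.
import time

def sleepCountdown(seconds, step):
    remaining = seconds
    while remaining > 0:
        print(remaining, "seconds left...")
        time.sleep(min(step, remaining))
        remaining -= step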
html = urlopen("http://www.pythonscraping.com/pages/page3.html") b=BeautifulSoup(html,'html.parser') for child in b.find("table",{"id":"giftList"}).children: print(child) print("888888888888888888888888888888888888888888888888888888888888888888888888888888") for child in b.find("table",{"id":"giftList"}).descendants: print(child) print ("1111111111111111111111111111111111111111111111") for sibling in b.find("table",{"id":"giftList"}).tr.next_siblings: print(sibling) print ("1111111111111111111111111111111111111111111111") for sibling in b.find("table",{"id":"giftList"}).tr: print(sibling) print(b.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text()) images = b.findAll("img", {"src":re.compile("\.\.\/img\/gifts\/img.*\.jpg")}) for image in images: print(image["src"]) print(image.attrs["src"]) print(image["src"]) b.findAll(lambda tag: len(tag.attrs) == 2)
# coding: utf-8
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs_obj = BeautifulSoup(html, "html.parser")

# links = bs_obj.findAll('a')
body_content = bs_obj.find("div", {"id": "bodyContent"})
links = body_content.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
for link in links:
    if 'href' in link.attrs:
        print(link.attrs['href'])