def get_data(result):
    # Parse the current-conditions "today_nowcard" panel from a weather.com-style page.
    soup = Bsoup(result.text, 'html.parser')
    Day = soup.find(
        "div", class_="today_nowcard-main component panel today-card-other-fog")
    Place = Day.header.h1.contents[0]
    timestamp = Day.find("p", class_="today_nowcard-timestamp")
    Last_Update = timestamp.span.text + timestamp.span.next_sibling.text
    temp = Day.find("div", class_="today_nowcard-temp")
    weather_type = Day.find("div", class_="today_nowcard-phrase")
    feels_like = Day.find("div", class_="today_nowcard-feels")
    UV_index = list(
        Day.find("div", class_="today_nowcard-hilo").stripped_strings)
    UVstring = "".join(UV_index)
    # Re-insert the spacing and line break lost when the stripped strings were joined.
    UV_index_text = (UVstring[0:4] + " " + UVstring[4:7] + "\n" +
                     UVstring[7:15] + " " + UVstring[15:])
    return [Place, Last_Update, temp, weather_type, feels_like, UV_index_text]
def get_top_exts(web_store_scrape_file, with_num_ratings=False):
    """
    Scrape the file and return the extension IDs.

    :param web_store_scrape_file: Path to an HTML file saved from the Chrome
        Web Store's "Popular" category page. There is no guarantee that the
        CSS classes and tag attributes used to locate the desired information
        will keep working in the future; check the Web Store's source to be sure.
    :type web_store_scrape_file: str
    :param with_num_ratings: If True, also return the number of ratings; the
        return type will then be a dictionary mapping extension ID to count.
    :type with_num_ratings: bool
    :return: The extension IDs.
    :rtype: tuple|dict
    """
    with open(web_store_scrape_file) as markup:
        soup = Bsoup(markup, "lxml")
    ext_num_ratings = {}
    for tile in soup.find_all('div', class_='webstore-test-wall-tile'):
        link = tile.a.get('href')
        ext_id = id_from_url(link)
        rating = tile.find('div', attrs={'g:type': "AverageStarRating"})
        # The rating count is rendered with parentheses around it, e.g. "(1234)".
        num_ratings = int(rating.span.string[1:-1])
        ext_num_ratings[ext_id] = num_ratings
    if with_num_ratings:
        return ext_num_ratings
    return tuple(ext_num_ratings.keys())
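# A minimal usage sketch for get_top_exts(), assuming a copy of the Web Store
# "Popular" page was saved to the hypothetical path popular.html and that the
# id_from_url() helper used above is defined elsewhere in this module.
top_ids = get_top_exts("popular.html")
print(len(top_ids), "extensions found")
for ext_id, num_ratings in get_top_exts("popular.html", with_num_ratings=True).items():
    print(ext_id, num_ratings)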
def conta_nodi(fileIn, selettore):
    '''Return the number of nodes in the tree that match the CSS selector.'''
    albero = my_html.fparse(fileIn)
    with open(fileIn) as page:
        soup = Bsoup(page, 'html.parser')
    cont = 0
    # Id (#) and class (.) selectors are handled by the conto() helper.
    if selettore.startswith('#') or selettore.startswith('.'):
        return conto(albero, soup, selettore, cont)
    # Attribute selectors, e.g. "@[width=300]".
    if selettore.startswith('@'):
        selettore2 = selettore.strip('@[]')
        attributo, valore = selettore2.split('=')
        for elemento in soup.find_all(attrs={attributo: valore}):
            cont += 1
        return cont
    # Descendant selectors, e.g. "p a".
    if ' ' in selettore:
        listaselettori = selettore.split(' ')
        avo = listaselettori[0]
        discendente = listaselettori[1]
        for elemento in soup.find_all(avo):
            for el in elemento.find_all(discendente):
                cont += 1
        return cont
    # Plain tag selectors.
    for k in soup.find_all(selettore):
        cont += 1
    return cont
def scrape_page(input_tag):
    # Build the Goodreads quote-search URL for the given tag.
    # Website: https://www.goodreads.com/quotes/tag
    page_URL = ("https://www.goodreads.com/quotes/search?utf8=%E2%9C%93&q=" +
                str(input_tag) + "&commit=Search")
    # Open the connection and fetch the page content.
    try:
        page_client = urlopen(page_URL)
        page_html = page_client.read()
        page_client.close()
    except HTTPError as e:
        page_html = e.read()
    # Parse the HTML.
    soup = Bsoup(page_html, "html.parser")
    # Access the quote containers on the page.
    outer_container = soup.findAll("div", {"class": "quoteDetails"})
    quote_list = []
    # Loop through all quote containers to get the actual quote text.
    for container in outer_container:
        # The first div inside the container holds the text, which needs formatting.
        quote = container.div.text
        quote = quote.split("//", maxsplit=1)[0].strip().replace("\n", " ")
        if len(quote) <= 264:
            quote_list.append(quote)
    return quote_list
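# A minimal usage sketch, assuming urlopen and HTTPError are imported from
# urllib.request / urllib.error elsewhere; the tag "inspirational" is only an example.
for quote in scrape_page("inspirational"):
    print(quote)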
def __getImagesUrl(self, name):
    url = self.url + name
    print(url)
    driver = wd.Chrome(self.wd_path)
    driver.get(url)
    # Scroll down in steps so the page lazy-loads more image results.
    scroll_value = self.scroll_count
    for i in range(self.scroll_full_count):
        driver.execute_script("window.scrollTo(0, " + str(scroll_value) + ")")
        scroll_value += self.scroll_count
        time.sleep(0.1)
    soup = Bsoup(driver.page_source, "html.parser")
    driver.quit()
    images = soup.find_all("img", attrs={"class": "_2UpQX"})
    urls_arr = []
    for image in images:
        if image.get("src") is not None:
            urls_arr.append(image.get("src"))
    print(urls_arr)
    return urls_arr
def get_soups(urls):
    soups = []
    for url in urls:
        url_o = requests.get(url)
        soup = Bsoup(url_o.text, 'html.parser')
        soups.append(soup)
    return soups
def scrape_page():
    # This scraper is specific to this gallery page, not universal.
    page_URL = "https://www.teenvogue.com/gallery/best-harry-potter-quotes"
    # Open the connection and fetch the page content.
    page_client = urlopen(page_URL)
    page_html = page_client.read()
    page_client.close()
    # Parse the HTML.
    soup = Bsoup(page_html, "html.parser")
    # Access the caption containers on the page.
    outer_container = soup.findAll("div", {"class": "gallery-slide-caption__dek"})
    quote_list = []
    # Loop through all quote containers to get the actual quote text.
    for container in outer_container:
        # The first div inside the container holds the text, which needs formatting.
        quote = container.div.text
        quote = quote.strip().replace("\n", " ")
        if len(quote) <= 264:
            quote_list.append(quote)
    return quote_list
def searchAuthors(request):
    keyword = request.POST.get('keyword', False)
    Authors = []
    Authorlink = []
    url = ('http://scholar.google.com/citations?view_op=search_authors&mauthors='
           + str(keyword) + '&hl=en&oi=drw')
    response = requests.get(url)
    html_soup = Bsoup(response.text, 'html.parser')
    name_containers = html_soup.find_all('div', class_='gsc_1usr')
    interests = []
    for container in name_containers:
        interest = container.find_all('a', class_='gs_ai_one_int')
        mylist = [i.text for i in interest]
        interests.append(mylist)
        Authors.append(container.div.h3.a.text)
        k = str(container.div.h3.a['href'])
        # Extract the profile ID that follows "user=" in the author's link.
        user_id = k.split('user=')[-1].split('&')[0]
        Authorlink.append(
            "https://scholar.google.com/citations?view_op=medium_photo&user=" + user_id)
    author = zip(Authors, Authorlink, interests)
    return render(request, 'profiles/index.html', context={'author': author})
def parse_page(url):
    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')
    return page_parsed
def main():
    names = []
    prices = []
    changes = []
    percentChanges = []
    marketCaps = []
    totalVolumes = []
    circulatingSupplys = []
    result = requests.get(url)
    soup = Bsoup(result.content, 'html.parser')
    # Each row of the Yahoo Finance screener table carries this long utility-class string.
    for listing in soup.find_all(
            'tr',
            attrs={
                'class': 'simpTblRow Bgc($extraLightBlue):h BdB Bdbc($finLightGrayAlt) Bdbc($tableBorderBlue):h H(32px) Bgc(white)'
            }):
        for name in listing.find_all('td', attrs={'aria-label': 'Name'}):
            names.append(name.text)
        for price in listing.find_all('td', attrs={'aria-label': 'Price (intraday)'}):
            prices.append(price.find('span').text)
        for change in listing.find_all('td', attrs={'aria-label': 'Change'}):
            changes.append(change.text)
        for percentChange in listing.find_all('td', attrs={'aria-label': '% change'}):
            percentChanges.append(percentChange.text)
        for marketCap in listing.find_all('td', attrs={'aria-label': 'Market cap'}):
            marketCaps.append(marketCap.text)
        for totalVolume in listing.find_all('td', attrs={'aria-label': 'Avg vol (3-month)'}):
            totalVolumes.append(totalVolume.text)
        for circulatingSupply in listing.find_all('td', attrs={'aria-label': 'Volume'}):
            circulatingSupplys.append(circulatingSupply.text)
    print(len(names))
    #for i in range(0, len(names)):
    #    print(float(changes[i][0:len(percentChanges[i]) - 1]))
    #    print(float(percentChanges[i][0:len(percentChanges[i]) - 1]))
    print(si.get_day_gainers().columns)  # si.get_day_gainers() returns a pandas DataFrame
def check_file(fname):
    global total_img_cnt
    with open(fname, 'r') as fh:
        content = fh.read()
    soup = Bsoup(content, features="html.parser")
    img_cnt = 0
    for imgtag in soup.find_all('img'):
        img_cnt += 1
        total_img_cnt += 1
        img_path = imgtag['src']
        print('%d) tag path: %s' % (img_cnt, img_path))
        fullpath = join(OUTPUT_DIR, img_path)
        if not isfile(fullpath):
            # A missing image file is an error, so exit with a non-zero status.
            print('broken path: %s' % fullpath)
            sys.exit(1)
    print('looks good! images checked: %s' % img_cnt)
def find_links(name):
    name = name.replace(" ", "+")
    # The query parameters must be joined with plain "&"; a stray backslash here
    # would end up in the request and break the search URL.
    url_str = ('https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q={}'
               '&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}'
               '&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg'
               '.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s')
    headers = {
        "User-Agent": "Chrome/65.0.3325.162 Safari/537.36",
        "Content-Type": "application/json"
    }
    url_str = url_str.format(name, 0)
    print(url_str)
    request = ulib.Request(url_str, None, headers)
    json_str = ulib.urlopen(request).read()
    json_str = json.loads(json_str)
    # The second element of the async response chunk holds the HTML fragment
    # containing the image thumbnails.
    soup = Bsoup(json_str[1][1], 'lxml')
    soup_imgs = soup.find_all("img")
    img_links = [img["src"] for img in soup_imgs]
    return img_links
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    # Clear out previously stored quotes before re-scraping.
    instance = Quotes.objects.all()
    instance.delete()
    for i in range(1, 4):
        url = 'https://www.goodreads.com/quotes?page=' + str(i)
        source = requests.get(str(url)).content
        soup = Bsoup(source, 'lxml')
        articles = soup.find_all('div', class_='quoteText')
        for article in articles:
            #main = article.find('a')
            #link = main['href']
            #image_src = main.find('img')['src']
            text = article.text
            new_quote = Quotes()
            new_quote.quote = text
            #new_quote.image = image_src
            if len(new_quote.quote) <= 255:
                new_quote.save()
    return redirect('../')
import requests
from bs4 import BeautifulSoup as Bsoup
from pymongo import MongoClient as Mongo

client = Mongo('localhost', 27017)
db = client.m_list

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
data = requests.get('https://www.genie.co.kr/chart/top200?ditc=D&rtm=Y', headers=headers)
soup = Bsoup(data.text, 'html.parser')

music_lists = soup.select(
    '#body-content > div.newest-list > div > table > tbody > tr')
for music in music_lists:
    # Drop the rank-change badge so only the rank number remains in the cell.
    music.select_one('td.number > span').decompose()
    rank = music.select_one('td.number').text.strip()
    title = music.select_one('a.title.ellipsis').text.strip()
    artist_name = music.select_one('a.artist.ellipsis').text
    doc = {'rank': rank, 'title': title, 'artist': artist_name}
    # Upsert one document per chart position so the collection stays current.
    db.m_list.update_one({'rank': rank}, {'$set': doc}, upsert=True)
def get_result(url):
    res = requests.get(url)
    default = 'Not Available'
    soup = Bsoup(res.content, 'html.parser')
    items = soup.find_all('li', attrs={'class': 'ais-InfiniteHits-item'})
    courses = list()

    def text_or_default(tag):
        # Return the element's text if it was found, otherwise the fallback value.
        return tag.get_text() if tag else default

    for item in items:
        course = {}
        course['course_title'] = text_or_default(item.find('h2'))
        course['partner'] = text_or_default(
            item.find('span', attrs={'class': 'partner-name'}))
        course['rating_value'] = text_or_default(
            item.find('span', attrs={'class': 'ratings-text'}))

        # The ratings count is wrapped in parentheses, e.g. "(12,345)"; strip them.
        rating_count_src = item.find('span', attrs={'class': 'ratings-count'})
        course['rating_count'] = (rating_count_src.get_text()[1:-1]
                                  if rating_count_src else default)

        course['enrollment_numbers'] = text_or_default(
            item.find('span', attrs={'class': 'enrollment-number'}))
        course['course_difficulty'] = text_or_default(
            item.find('span', attrs={'class': 'difficulty'}))
        course['type'] = text_or_default(
            item.find('div', attrs={'class': 'product-type-row'}))

        img = item.find('img')
        course['imgurl'] = img.get('src') if img else (
            'https://upload.wikimedia.org/wikipedia/commons/thumb/9/97/'
            'Coursera-Logo_600x600.svg/1200px-Coursera-Logo_600x600.svg.png')

        courses.append(course)
    return courses
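# A minimal usage sketch for get_result(); the search URL below is a hypothetical
# Coursera catalogue query and may not match the markup this parser expects.
courses = get_result("https://www.coursera.org/search?query=python")
for course in courses:
    print(course['course_title'], '-', course['partner'])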
def get_soup(url):
    url_o = requests.get(url)
    soup = Bsoup(url_o.text, 'html.parser')
    return soup
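# A minimal usage sketch for get_soup()/get_soups(); the example.com URLs are placeholders.
soup = get_soup("https://example.com")
print(soup.title.text if soup.title else "no <title> found")
soups = get_soups(["https://example.com", "https://example.org"])
print(len(soups), "pages parsed")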
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as Bsoup

# BeautifulSoup parses the HTML text of a page, while urllib grabs the page itself.
# Web scraping product listings from newegg.com (an Amazon-like store for hardware
# and electronics); the query below searches for Samsung S8 Plus phones.
target_url = 'https://www.newegg.com/global/ie/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=samsung+s8+plus&ignorear=0&N=-1&isNodeId=1'

# Open the connection, grab the web page, and download it.
uClient = uReq(target_url)
page_html = uClient.read()
uClient.close()

# HTML parsing: the first argument is the page markup, the second tells
# BeautifulSoup which parser to use (HTML, XML, etc.).
page_soup = Bsoup(page_html, "html.parser")

# The goal is to convert the phone (Samsung S8) model data to a CSV file.
# Inspect the page to find the element that holds the entire container for one
# model, then loop through the remaining containers the same way.

# Grab each product container (the second parameter is a dict of attributes to match).
containers = page_soup.find_all("div", {"class": "item-container"})

filename = "mobile_product.csv"
f = open(filename, "w")
headers = "brand, Operating System, Price\n"
f.write(headers)
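# The snippet above stops after writing the CSV header. A sketch of the row-writing
# loop it builds toward might look like the following. The classes "item-branding",
# "item-title", and "price-current" are assumptions about Newegg's tile markup and
# should be checked against the live page; the middle column uses the listing title
# as a stand-in, since the operating-system field would need its own selector.
for container in containers:
    brand_div = container.find("div", {"class": "item-branding"})
    brand = brand_div.img["title"] if brand_div and brand_div.img else "N/A"
    title_tag = container.find("a", {"class": "item-title"})
    title = title_tag.text.strip() if title_tag else "N/A"
    price_tag = container.find("li", {"class": "price-current"})
    price = price_tag.strong.text if price_tag and price_tag.strong else "N/A"
    # Strip commas so the values do not break the comma-separated layout.
    f.write(brand.replace(",", "") + "," + title.replace(",", "") + "," +
            price.replace(",", "") + "\n")
f.close()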
try:
    file = openFile('r')
except IOError:
    # Create the file first if it does not exist yet, then reopen it for reading.
    file = openFile('w')
    file = openFile('r')
dwded = file.readlines()

board = get('https://www.billboard.com/charts/hot-100').text
boardSoup = Bsoup(board, 'lxml')  # HTML parser

song_artist = {}
url = []
fileTitle = []
stringTitles = []
count = 1
repeat = False
amount = raw_input('Song range:\n').split(': ')
import re

url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=163834&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page="
url2 = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=134963&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page="
url3 = "https://movie.naver.com/movie/point/af/list.nhn?&page="

with open("sentences.csv", "a", encoding="utf-8", newline="") as fp:
    wr = csv.writer(fp)
    count = 13501
    for i in range(1, 351):
        res = req.get(url + str(i))
        if res.status_code == 200:
            soup = Bsoup(res.text, "html.parser")
            # Review texts live in elements whose id matches "_filtered_ment_<n>".
            span_texts = soup.find_all(
                attrs={"id": re.compile("_filtered_ment_[0-9]")})
            div_scores = soup.find_all(attrs={"class": "star_score"})
            text_score_list = list(zip(span_texts, div_scores))
            for span_text, div_score in text_score_list:
                final_text = span_text.text.strip()
                final_score = int(div_score.text)
                wr.writerow([count, final_text, final_score])
                count += 1
        print(i)
def get_target(url, word, html_tag, class_name):
    document = requests.get(url + word).content
    soup = Bsoup(document, 'html.parser')
    result = soup.find_all(html_tag, class_=class_name)
    return result
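# A minimal usage sketch for get_target(); the base URL and CSS class below are
# placeholders, not a known site layout.
matches = get_target("https://example.com/search?q=", "python", "div", "result")
for match in matches:
    print(match.get_text(strip=True))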
try:
    os.mkdir('mp3')
    os.mkdir('mp4')
except WindowsError:
    pass

song = raw_input('Song name: ')
url = creatUrl(song)
author = creatUrl(raw_input('Author name (optional): '))
url = 'https://www.youtube.com/results?search_query=' + url + '+by+' + author + '+lyrics'
youtube = Bsoup(get(url).text, 'lxml')  # HTML parser

# Take the first search result that links to a video watch page.
for link in youtube.findAll('a'):
    href_attr = link.get('href')
    if href_attr and '/watch?v=' in href_attr:
        href = 'https://www.youtube.com' + href_attr
        break

mp4 = YouTube(href)
stream = mp4.streams.first()
stream.download(path + '\\mp4')
from bs4 import BeautifulSoup as Bsoup
from urllib.request import urlopen

my_url = "https://morvanzhou.github.io/static/scraping/list.html"
page_html = urlopen(my_url).read().decode('utf-8')
#print(page_html)

## HTML parsing
page_soup = Bsoup(page_html, "html.parser")
#print(page_soup)

## Use the class attribute to narrow the search.
## find_all takes two constraints here: the tag name and a class filter.
parse_month = page_soup.find_all('li', {"class": "month"})
parse_jan = page_soup.find_all('ul', {"class": "jan"})
print(parse_jan)
for m in parse_jan:
    print(m.get_text())
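# A short follow-up sketch: once the <ul class="jan"> block is isolated, its own
# find_all can narrow the search further to the <li> entries nested inside it.
# This assumes the list page keeps its dates as <li> children of that <ul>.
for ul in parse_jan:
    for li in ul.find_all('li'):
        print(li.get_text())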
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as Bsoup

my_url = 'https://www.redditmetrics.com/top'

# List of the subreddits
subreddits = []

# Opening the connection and grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# Connection closed

# Parsing the HTML and locating the top-subreddits table
soup = Bsoup(page_html, 'html.parser')
paras = soup.find('table')
count = 0
print paras

paras2 = str(paras)
file_obj = open("Params", 'w')
file_obj.write(paras2)
file_obj.close()

fileObj = open("Params", 'r')
params = fileObj.read()

i = 0
listo = []
def scrape_books(links):
    """Parses book information from a Goodreads page"""
    books_read = 0
    while books_read < 200 and len(links) > 0:
        try:
            page = requests.get(links.pop(0))
        except requests.exceptions.RequestException as error:
            print(error)
            return False
        soup = Bsoup(page.content, "html.parser")
        book_title = soup.findAll("h1")[0].get_text().strip()
        book_url = soup.head.link["href"]
        book_id = (book_url.split('/')[-1]).split('.')[0]
        isbn13 = soup.find("span", {"itemprop": "isbn"})
        if isbn13 is not None:
            isbn13 = isbn13.get_text().strip()
        author_info = soup.find(class_="authorName")
        author_url = author_info["href"]
        if author_url is not None and author_url not in author_links:
            author_links.append(author_url)
        author_name = author_info.find("span").get_text()
        details_container = soup.find("div", {"id": "details"})
        # isbn = details_container.findAll(class_="infoBoxRowItem")
        page_meta = soup.find(id="bookMeta")
        rating = page_meta.find("span", {"itemprop": "ratingValue"}).get_text().strip()
        rating_count = page_meta.find("meta", {"itemprop": "ratingCount"})["content"]
        review_count = page_meta.find("meta", {"itemprop": "reviewCount"})["content"]
        image_url = soup.find(id="coverImage")["src"]
        similar_list = soup.find("div", {"class": "carouselRow"})
        similar_books = similar_list.findAll("a")
        books = []
        for b in similar_books:
            books.append(b.find("img")["alt"].strip().replace(",", " |"))
            # Read the link from the tag itself (list.append() returns None),
            # then queue it only if it has not been seen before.
            similar_link = b["href"]
            if similar_link is not None and similar_link not in book_links:
                book_links.append(similar_link)
                links.append(similar_link)
        book_object = {
            "title": book_title,
            "book_url": book_url,
            "book_id": book_id,
            "ISBN": isbn13,
            "author_url": author_url,
            "author": author_name,
            "rating": rating,
            "rating_count": rating_count,
            "review_count": review_count,
            "image_url": image_url,
            "similar_books": books
        }
        books_read += 1
        book_info['books'].append(book_object)
        print(books_read)
        add_to_json()
    scrape_author(author_links)
    return True
def scrape_author(links):
    """Scrapes a Goodreads author page to get information"""
    authors_read = 0
    while authors_read < 50 and len(links) > 0:
        try:
            page = requests.get(links.pop(0))
        except requests.exceptions.RequestException as error:
            print(error)
            return False
        soup = Bsoup(page.content, "html.parser")
        author_name = soup.find("h1").get_text().strip()
        author_url = soup.head.link["href"]
        author_id = (author_url.split('/')[-1]).split('.')[0]
        rating = soup.find("span", {"class": "average"}).get_text().strip()
        rating_count = soup.find("span", {"itemprop": "ratingCount"}).get_text().strip()
        review_count = soup.find("span", {"itemprop": "reviewCount"}).get_text().strip()
        image_url = soup.find("img", {"alt": author_name})["src"]
        similar_container = soup.find(class_="hreview-aggregate")
        similar_lists = similar_container.findAll("a")
        # author_books = similar_lists[0]["href"]
        similar_authors = similar_lists[1]["href"]
        try:
            authors_page = requests.get(base_url + similar_authors)
        except requests.exceptions.RequestException as error:
            print(error)
            sys.exit(1)
        authors_soup = Bsoup(authors_page.content, "html.parser")
        list_container = authors_soup.findAll(
            "div", {"class": "listWithDividers__item"})
        authors_list = []
        authors_links = []
        count = 0
        for aut in list_container:
            authors = aut.find("span", {"itemprop": "name"}).get_text().strip()
            author_link = aut.find("a", {"itemprop": "url"})["href"]
            # Skip the first entry (index 0).
            if count != 0:
                authors_list.append(authors)
                if author_link not in links and author_link is not None:
                    links.append(author_link)
            count += 1
        books_container = soup.findAll("a", {"class": "bookTitle"})
        books_list = []
        # print(authors_list, "\n", links)
        count = 0
        for bk in books_container:
            sim_book_name = bk.find("span", {"itemprop": "name"})
            if sim_book_name is None:
                continue
            if count != 0:
                books_list.append(sim_book_name.get_text().strip())
            count += 1
        author_object = {
            "name": author_name,
            "author_url": author_url,
            "author_id": author_id,
            "rating": rating,
            "rating_count": rating_count,
            "review_count": review_count,
            "image_url": image_url,
            "related_authors": authors_list,
            "author_books": books_list
        }
        author_info['authors'].append(author_object)
        authors_read += 1
        add_to_json()
    return True