def __get_youtube_id_fallback(self, track, cancellable):
    """
        Get youtube id (fallback)
        @param track as Track
        @param cancellable as Gio.Cancellable
        @return youtube id as str
    """
    try:
        # The PyPI package is "beautifulsoup4"; the import module is "bs4"
        from bs4 import BeautifulSoup
    except ImportError:
        print("$ sudo pip3 install beautifulsoup4")
        return None
    try:
        unescaped = "%s %s" % (track.artists[0], track.name)
        search = GLib.uri_escape_string(unescaped.replace(" ", "+"),
                                        None,
                                        True)
        uri = "https://www.youtube.com/results?search_query=%s" % search
        (status, data) = App().task_helper.load_uri_content_sync(uri,
                                                                 cancellable)
        if not status:
            return None
        html = data.decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        ytems = []
        for link in soup.findAll("a"):
            href = link.get("href")
            title = link.get("title")
            if href is None or title is None:
                continue
            if href.startswith("/watch?v="):
                href = href.replace("/watch?v=", "")
                ytems.append((href, title))
        dic = {}
        best = self.__BAD_SCORE
        for (yid, title) in ytems:
            score = self.__get_youtube_score(title,
                                             track.name,
                                             track.artists[0],
                                             track.album.name)
            if score < best:
                best = score
            elif score == best:
                continue  # Keep first result
            dic[score] = yid
        # Return url from first dic item
        if best == self.__BAD_SCORE:
            return None
        else:
            return dic[best]
    except Exception as e:
        Logger.warning("YouTubeHelper::__get_youtube_id_fallback(): %s", e)
        self.__fallback = True
        return None
def parse_url_to_html(self, url):
    '''
    Use soup.find_all() to locate the main content tag, then save the
    body content to the file a.html.
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all(class_='x-wiki-content')[0]
    html = str(body)
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(html)
def get_url_list(self, url):
    '''
    Get the full list of URLs from the table-of-contents menu.
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    menu_tag = soup.find_all(class_='uk-nav uk-nav-side')[1]
    urls = []
    for li in menu_tag.find_all('li'):
        url = 'exampleurl' + li.a.get('href')
        urls.append(url)
    return urls
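# Hedged usage sketch (not part of the original snippets): it assumes the two
# methods above are module-level functions reused on a small crawler class; the
# class name "DocCrawler" and the index URL are placeholders for illustration.
import requests
from bs4 import BeautifulSoup


class DocCrawler:
    # Reuse the two functions defined above as instance methods.
    get_url_list = get_url_list
    parse_url_to_html = parse_url_to_html


crawler = DocCrawler()
for page_url in crawler.get_url_list('https://example.com/docs/index'):
    # parse_url_to_html() overwrites a.html each time; a real crawler would
    # write one file per page.
    crawler.parse_url_to_html(page_url)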
def translatePage(
        pageurl=DEFAULT_TEST_PAGE,
        outputpath=DEFAULT_OUTPUT_PATH,
        outfile=DEFAULT_OUTFILE_NAME,
        titleid=WIKI_TITLE_ID,
        bodyid=WIKI_BODY_ID,
        percent=100):
    """
    Loads a given URL and scrambles the page's header and bodytext contents.
    Defaults are set for Wikipedia pages.

    Keywords:
    pageurl - URL for html page to load
    outputpath - Output file directory path (default: current directory)
    outfile - Output file name (should end in ".html" for easy browser display)
    titleid - html tag id for header text
    bodyid - html tag id for body text
    percent - [UNIMPLEMENTED] Percentage of words (0-100) to gibberize

    Returns the output file path
    """
    # Initialize variables for scope
    pagetitle = ""
    bodytext = ""
    wordlist = {}

    # Load the page
    htmlobj = urllib2.urlopen(pageurl).read()
    titleStrainer = SoupStrainer(id=titleid)
    bodyStrainer = SoupStrainer(id=bodyid)

    # This is a hack to find baseurl for filling in absolute paths
    # DANGER: Requires a properly-formatted url
    # DANGER: Performs no error checking for a proper url
    baseurl = '/'.join(pageurl.split('/')[:3])

    # Make all links absolute to the passed url so pages load css/etc properly
    # NOTE: Order is important so the // are done before the /
    htmlobj = re.sub('href="//', 'href="http://', htmlobj)
    htmlobj = re.sub('href="/', 'href="%s/' % baseurl, htmlobj)
    # Make image paths absolute so images load properly
    htmlobj = re.sub('src="//', 'src="http://', htmlobj)
    htmlobj = re.sub('srcset="//', 'srcset="http://', htmlobj)

    soup = BeautifulSoup(htmlobj, 'html.parser')

    # Find the "firstHeading" h1 so we can use the title as a random seed
    # h1 = BeautifulSoup(htmlobj, 'html.parser', parse_only=titleStrainer)
    h1 = soup.find_all(titleStrainer)[0]

    # Perform error checking
    if len(h1.contents) > 0:
        pagetitle = str(h1.get_text())
    else:
        raise ValueError("Header contents length error: len %i"
                         % (len(h1.contents)))

    # COOL PART: Set the random seed to the page title so the
    # article 'translation' will be reproducible
    random.seed(pagetitle)

    # Now get the body content
    div = BeautifulSoup(htmlobj, 'html.parser', parse_only=bodyStrainer)

    # Get the body text
    try:
        bodytext = div.get_text().encode('utf-8')
        # bodytext = soup.div.get_text().encode('utf-8')
    except Exception as e:
        logging.error(
            "Unexpected error: {}\nDiv contained {} elements.".format(
                str(e), len(div.contents)))
        raise e
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")

products = []
prices = []
ratings = []

driver.get(
    "https://www.flipkart.com/gaming-laptops-store?otracker=nmenu_sub_Electronics_0_Gaming%20Laptops&otracker=nmenu_sub_Electronics_0_Gaming%20Laptops"
)
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
for a in soup.findAll('a', href=True, attrs={'class': '_2cLu-l'}):
    name = a.find('div', attrs={'class': '_2cLu-l'})
    price = a.find('div', attrs={'class': '_1vC4OE'})
    rating = a.find('div', attrs={'class': 'hGSR34'})
    products.append(name.text)
    prices.append(price.text)
    ratings.append(rating.text)
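# Hedged follow-up sketch (not in the original snippet): pandas is imported
# above but never used, so this shows one plausible next step -- collecting the
# scraped lists into a DataFrame and saving them. The column names and the
# output filename are assumptions made for illustration.
df = pd.DataFrame({
    'Product Name': products,
    'Price': prices,
    'Rating': ratings,
})
df.to_csv('products.csv', index=False, encoding='utf-8')
driver.quit()  # release the browser once scraping is done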
import requests
from bs4 import BeautifulSoup

open_bbc_page = requests.get(main_url).json()

# get the list of articles from the response
article = open_bbc_page["articles"]

# empty list which will contain all trending news
bbc_news = []
for ar in article:
    bbc_news.append(ar["title"])

# Getting news from Times of India
toi_r = requests.get("https://timesofindia.indiatimes.com/briefs")
toi_soup = BeautifulSoup(toi_r.content, 'html5lib')
toi_headings = toi_soup.find_all('h2')
toi_headings = toi_headings[0:-13]  # removing footers

toi_news = []
for th in toi_headings:
    if len(th.text) < 25:
        continue
    toi_news.append(th.text)

# Getting news from Hindustan Times
ht_r = requests.get("https://www.hindustantimes.com/india-news/")
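# Hedged continuation sketch (not from the original fragment): the Hindustan
# Times response would be parsed much like the Times of India one, but its CSS
# selectors are not shown above, so this only prints the headlines already
# collected.
for source, headlines in (("BBC", bbc_news), ("Times of India", toi_news)):
    print("--- %s ---" % source)
    for i, headline in enumerate(headlines, start=1):
        print("%d. %s" % (i, headline))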
"""In-class activities for 09-25.""" print('Parsing HTML (for reals this time)') # Regexes are not ideal HTML parsers # In order to parse HTML, you need an HTML parser, such as beautifulsoup print('https://www.crummy.com/software/BeautifulSoup/bs4/doc/#quick-start') input('Press [enter] to continue') ############################################################################### from beautifulsoup4 import BeautifulSoup import re # open a previously saved HTML file and soupify it with open('Healthcare_in_Canada-wikipedia.html') as healthy_file: soup = BeautifulSoup(healthy_file, 'html5lib') # html5lib is the parser print("Let's extract all of the links on this page...") print('\tprinting just the first 10...') for link in soup.find_all('a'): print(link.get('href')) print() input('Press [enter] to continue') print("Let's find all the headers with <h2> tags in this article....") h2s = soup.find_all('h2') for tag in h2s: print(tag.span.string) print() input('Press [enter] to continue')
url = "http://sfbay.craigslist.org/web/" from beautifulsoup4 import BeautifulSoup import requests r = requests.get(url) soup = BeautifulSoup(r.text) content = soup.find('div', attrs={'class': 'content'}) p_elements = content.findAll('p', attrs = {'class':'row'}) print p_elements
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def get_soup(url, header):
    # Build a Request with custom headers and parse the response body
    return BeautifulSoup(urlopen(Request(url, headers=header)), 'html.parser')
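# Hedged usage example (not part of the original one-liner): the header dict
# below is a common pattern for sites that reject the default urllib
# User-Agent; the URL is a placeholder.
headers = {'User-Agent': 'Mozilla/5.0'}
soup = get_soup('https://example.com', headers)
print(soup.title.string if soup.title else 'no <title> found')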