def __get_youtube_id_fallback(self, track, cancellable):
    """
        Get youtube id (fallback)
        @param track as Track
        @param cancellable as Gio.Cancellable
        @return youtube id as str
    """
    try:
        # The PyPI package is "beautifulsoup4"; the import module is "bs4"
        from bs4 import BeautifulSoup
    except ImportError:
        print("$ sudo pip3 install beautifulsoup4")
        return None
    try:
        unescaped = "%s %s" % (track.artists[0], track.name)
        search = GLib.uri_escape_string(unescaped.replace(" ", "+"),
                                        None,
                                        True)
        uri = "https://www.youtube.com/results?search_query=%s" % search
        (status, data) = App().task_helper.load_uri_content_sync(uri,
                                                                 cancellable)
        if not status:
            return None
        html = data.decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        ytems = []
        for link in soup.findAll("a"):
            href = link.get("href")
            title = link.get("title")
            if href is None or title is None:
                continue
            if href.startswith("/watch?v="):
                href = href.replace("/watch?v=", "")
                ytems.append((href, title))
        dic = {}
        best = self.__BAD_SCORE
        for (yid, title) in ytems:
            score = self.__get_youtube_score(title,
                                             track.name,
                                             track.artists[0],
                                             track.album.name)
            if score < best:
                best = score
            elif score == best:
                continue  # Keep first result
            dic[score] = yid
        # Return url from first dic item
        if best == self.__BAD_SCORE:
            return None
        else:
            return dic[best]
    except Exception as e:
        Logger.warning("YouTubeHelper::__get_youtube_id_fallback(): %s", e)
        self.__fallback = True
        return None
def parse_url_to_html(self, url):
    '''
    Use soup.find_all() to locate the main content tag, then save the
    body content to the file a.html.
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all(class_='x-wiki-content')[0]
    html = str(body)
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(html)
def get_url_list(self, url):
    '''
    Get the full list of URLs from the table-of-contents menu.
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    menu_tag = soup.find_all(class_='uk-nav uk-nav-side')[1]
    urls = []
    for li in menu_tag.find_all('li'):
        url = 'exampleurl' + li.a.get('href')
        urls.append(url)
    return urls
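# Hedged usage sketch (not part of the original snippets): it assumes the two
# methods above are module-level functions reused on a small crawler class; the
# class name "DocCrawler" and the index URL are placeholders for illustration.
import requests
from bs4 import BeautifulSoup


class DocCrawler:
    # Reuse the two functions defined above as instance methods.
    get_url_list = get_url_list
    parse_url_to_html = parse_url_to_html


crawler = DocCrawler()
for page_url in crawler.get_url_list('https://example.com/docs/index'):
    # parse_url_to_html() overwrites a.html each time; a real crawler would
    # write one file per page.
    crawler.parse_url_to_html(page_url)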
def translatePage(
        pageurl=DEFAULT_TEST_PAGE,
        outputpath=DEFAULT_OUTPUT_PATH,
        outfile=DEFAULT_OUTFILE_NAME,
        titleid=WIKI_TITLE_ID,
        bodyid=WIKI_BODY_ID,
        percent=100):
    """
    Loads a given URL and scrambles the page's header and bodytext contents.
    Defaults are set for Wikipedia pages.

    Keywords:
    pageurl - URL for html page to load
    outputpath - Output file directory path (default: current directory)
    outfile - Output file name (should end in ".html" for easy browser display)
    titleid - html tag id for header text
    bodyid - html tag id for body text
    percent - [UNIMPLEMENTED] Percentage of words (0-100) to gibberize

    Returns the output file path
    """
    # Initialize variables for scope
    pagetitle = ""
    bodytext = ""
    wordlist = {}

    # Load the page
    htmlobj = urllib2.urlopen(pageurl).read()
    titleStrainer = SoupStrainer(id=titleid)
    bodyStrainer = SoupStrainer(id=bodyid)

    # This is a hack to find baseurl for filling in absolute paths
    # DANGER: Requires a properly-formatted url
    # DANGER: Performs no error checking for a proper url
    baseurl = '/'.join(pageurl.split('/')[:3])

    # Make all links absolute to the passed url so pages load css/etc properly
    # NOTE: Order is important so the // are done before the /
    htmlobj = re.sub('href="//', 'href="http://', htmlobj)
    htmlobj = re.sub('href="/', 'href="%s/' % baseurl, htmlobj)
    # Make image paths absolute so images load properly
    htmlobj = re.sub('src="//', 'src="http://', htmlobj)
    htmlobj = re.sub('srcset="//', 'srcset="http://', htmlobj)

    soup = BeautifulSoup(htmlobj, 'html.parser')

    # Find the "firstHeading" h1 so we can use the title as a random seed
    # h1 = BeautifulSoup(htmlobj, 'html.parser', parse_only=titleStrainer)
    h1 = soup.find_all(titleStrainer)[0]

    # Perform error checking
    if len(h1.contents) > 0:
        pagetitle = str(h1.get_text())
    else:
        raise ValueError("Header contents length error: len %i"
                         % (len(h1.contents)))

    # COOL PART: Set the random seed to the page title so the
    # article 'translation' will be reproducible
    random.seed(pagetitle)

    # Now get the body content
    div = BeautifulSoup(htmlobj, 'html.parser', parse_only=bodyStrainer)

    # Get the body text
    try:
        bodytext = div.get_text().encode('utf-8')
        # bodytext = soup.div.get_text().encode('utf-8')
    except Exception as e:
        logging.error(
            "Unexpected error: {}\nDiv contained {} elements.".format(
                str(e), len(div.contents)))
        raise e
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")

products = []
prices = []
ratings = []

driver.get(
    "https://www.flipkart.com/gaming-laptops-store?otracker=nmenu_sub_Electronics_0_Gaming%20Laptops&otracker=nmenu_sub_Electronics_0_Gaming%20Laptops"
)
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
for a in soup.findAll('a', href=True, attrs={'class': '_2cLu-l'}):
    name = a.find('div', attrs={'class': '_2cLu-l'})
    price = a.find('div', attrs={'class': '_1vC4OE'})
    rating = a.find('div', attrs={'class': 'hGSR34'})
    products.append(name.text)
    prices.append(price.text)
    ratings.append(rating.text)
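# Hedged follow-up sketch (not in the original snippet): pandas is imported
# above but never used, so this shows one plausible next step -- collecting the
# scraped lists into a DataFrame and saving them. The column names and the
# output filename are assumptions made for illustration.
df = pd.DataFrame({
    'Product Name': products,
    'Price': prices,
    'Rating': ratings,
})
df.to_csv('products.csv', index=False, encoding='utf-8')
driver.quit()  # release the browser once scraping is done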
import requests
from bs4 import BeautifulSoup

open_bbc_page = requests.get(main_url).json()

# get the list of articles from the response
article = open_bbc_page["articles"]

# empty list which will contain all trending news
bbc_news = []
for ar in article:
    bbc_news.append(ar["title"])

# Getting news from Times of India
toi_r = requests.get("https://timesofindia.indiatimes.com/briefs")
toi_soup = BeautifulSoup(toi_r.content, 'html5lib')
toi_headings = toi_soup.find_all('h2')
toi_headings = toi_headings[0:-13]  # removing footers

toi_news = []
for th in toi_headings:
    if len(th.text) < 25:
        continue
    toi_news.append(th.text)

# Getting news from Hindustan Times
ht_r = requests.get("https://www.hindustantimes.com/india-news/")
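# Hedged continuation sketch (not from the original fragment): the Hindustan
# Times response would be parsed much like the Times of India one, but its CSS
# selectors are not shown above, so this only prints the headlines already
# collected.
for source, headlines in (("BBC", bbc_news), ("Times of India", toi_news)):
    print("--- %s ---" % source)
    for i, headline in enumerate(headlines, start=1):
        print("%d. %s" % (i, headline))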
"""In-class activities for 09-25.""" print('Parsing HTML (for reals this time)') # Regexes are not ideal HTML parsers # In order to parse HTML, you need an HTML parser, such as beautifulsoup print('https://www.crummy.com/software/BeautifulSoup/bs4/doc/#quick-start') input('Press [enter] to continue') ############################################################################### from beautifulsoup4 import BeautifulSoup import re # open a previously saved HTML file and soupify it with open('Healthcare_in_Canada-wikipedia.html') as healthy_file: soup = BeautifulSoup(healthy_file, 'html5lib') # html5lib is the parser print("Let's extract all of the links on this page...") print('\tprinting just the first 10...') for link in soup.find_all('a'): print(link.get('href')) print() input('Press [enter] to continue') print("Let's find all the headers with <h2> tags in this article....") h2s = soup.find_all('h2') for tag in h2s: print(tag.span.string) print() input('Press [enter] to continue')
url = "http://sfbay.craigslist.org/web/" from beautifulsoup4 import BeautifulSoup import requests r = requests.get(url) soup = BeautifulSoup(r.text) content = soup.find('div', attrs={'class': 'content'}) p_elements = content.findAll('p', attrs = {'class':'row'}) print p_elements
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def get_soup(url, header):
    # Build a Request with custom headers and parse the response body
    return BeautifulSoup(urlopen(Request(url, headers=header)), 'html.parser')
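# Hedged usage example (not part of the original one-liner): the header dict
# below is a common pattern for sites that reject the default urllib
# User-Agent; the URL is a placeholder.
headers = {'User-Agent': 'Mozilla/5.0'}
soup = get_soup('https://example.com', headers)
print(soup.title.string if soup.title else 'no <title> found')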