import re
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup, Comment


def sanitizeHtml(value, base_url=None):
    # Drop wrapper tags outright; their text contents are kept.
    value = (value.replace('<div>', '').replace('</div>', '')
                  .replace('<p>', '').replace('</p>', '')
                  .replace('<span>', '').replace('</span>', ''))
    # Match "javascript:" / "vbscript:" even when obfuscated with
    # whitespace or hex entities between the characters.
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'br i em strong ul ol li u b a h1 h2 h3 blockquote'.split()
    validAttrs = 'href'.split()
    urlAttrs = 'href src'.split()  # Attributes which should hold a URL
    soup = BeautifulSoup(value)
    # Get rid of comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.extract()
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')
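# A minimal usage sketch for sanitizeHtml above; the input string and base
# URL are illustrative assumptions, not taken from the original code.
dirty = '<p>Hello <a href="/about">about</a><script>alert(1)</script></p>'
print sanitizeHtml(dirty, base_url='http://example.com')
# Expected (roughly): u'Hello <a href="http://example.com/about">about</a>'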
def _parse_tabela(self, html):
    # Result rows of the CEP search carry an onclick handler that calls
    # detalharCep with two numeric arguments; use that to find them.
    soup = BeautifulSoup(html)
    linhas = soup.findAll(
        'tr',
        attrs={'onclick': re.compile(r"javascript:detalharCep\('\d+','\d+'\);")})
    return [self._parse_linha_tabela(linha) for linha in linhas]
def Get6Songs(self, event):
    # Get the current song and fetch similar tracks from Pandora, e.g.
    # http://www.pandora.com/music/song/interpol/obstacle+1
    val = self.lc_songdora_results.GetFirstSelected()
    if val >= 0:
        # A result row is selected; reuse its stored Pandora URL.
        pandora_url = self.lc_songdora_results.GetItem(val, 3).GetText()
    else:
        # No selection: build the URL from the currently playing song.
        partist = self.parent.current_song.artist.replace(' ', '+')
        ptrack = self.parent.current_song.song.replace(' ', '+')
        pandora_url = PANDORA_SONG_URL + urllib2.quote(
            partist.encode('utf8') + '/' + ptrack.encode('utf8'))
    self.st_songdora_song.SetLabel(self.parent.current_song.artist + ' - ' +
                                   self.parent.current_song.song)
    page = urllib2.urlopen(pandora_url)
    print pandora_url
    soup = BeautifulSoup(page)
    counter = 0
    for songs in soup('span', id="similar_song"):
        # Each <span id="similar_song"> holds the track link, artist link
        # and album link at fixed positions in its contents.
        t = songs.contents[1]
        track = str(t).split('"')[3]
        link = PANDORA_SONG_URL + str(t).split('"')[1]
        a = songs.contents[6]
        soupa = BeautifulSoup(str(a))
        artist = soupa.a.string
        b = songs.contents[11]
        soupb = BeautifulSoup(str(b))
        album = soupb.a.string
        self.lc_songdora_results.InsertStringItem(counter, artist)
        self.lc_songdora_results.SetStringItem(counter, 1, track)
        self.lc_songdora_results.SetStringItem(counter, 2, album)
        self.lc_songdora_results.SetStringItem(counter, 3, link)
        counter += 1
def _parse_detalhe(self, html):
    soup = BeautifulSoup(html.decode('ISO-8859-1'))
    value_cells = soup.findAll('td', attrs={'class': 'value'})
    values = [cell.firstText(text=True) for cell in value_cells]
    # The third cell holds "Localidade/UF" (city/state) as a single string.
    localidade, uf = values[2].split('/')
    return {
        'Logradouro': values[0],
        'Bairro': values[1],
        'Localidade': localidade,
        'UF': uf,
        'CEP': values[3],
    }
class CorreiosWebsiteScraper(object):

    def __init__(self, http_client=urllib2):
        self.url = ('http://websro.correios.com.br/sro_bin/txect01$.QueryList'
                    '?P_ITEMCODE=&P_LINGUA=001&P_TESTE=&P_TIPO=001&P_COD_UNI=')
        self.http_client = http_client

    def get_encomenda_info(self, numero):
        request = self.http_client.urlopen('%s%s' % (self.url, numero))
        html = request.read()
        request.close()
        if html:
            encomenda = Encomenda(numero)
            for status in self._get_all_status_from_html(html):
                encomenda.adicionar_status(status)
            return encomenda

    def _get_all_status_from_html(self, html):
        html_info = re.search('.*(<table.*</TABLE>).*', html, re.S)
        try:
            table = html_info.group(1)
        except AttributeError:
            return [-1]
        soup = BeautifulSoup(table)
        status = []
        count = 0
        for tr in soup.table:
            # The first rows are headers; data rows start after row 4.
            if count > 4 and str(tr).strip() != '':
                if re.match(r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}',
                            tr.contents[0].string):
                    status.append(
                        Status(data=unicode(tr.contents[0].string),
                               local=unicode(tr.contents[1].string),
                               situacao=unicode(tr.contents[2].font.string)))
                else:
                    # Rows without a timestamp carry details for the
                    # previous status entry.
                    status[-1].detalhes = unicode(tr.contents[0].string)
            count += 1
        return status
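# A minimal usage sketch for CorreiosWebsiteScraper; the tracking code below
# is a made-up placeholder, and the iteration assumes Encomenda exposes the
# statuses added via adicionar_status as a list attribute.
scraper = CorreiosWebsiteScraper()
encomenda = scraper.get_encomenda_info('SS123456789BR')  # hypothetical code
if encomenda is not None:
    for status in encomenda.status:  # assumed attribute name
        print status.data, status.local, status.situacao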
def assertGetLink(self, response, link_class, link_no, of_count=-1, msg=''):
    """Tries to find an anchor element with the given class from the
    response. Checks that there are of_count total links of that class
    (unless of_count == -1). Gets the page from that link.
    """
    self.assertWellformed(response)
    parsed = BeautifulSoup.BeautifulSoup(response.content)
    anchors = parsed.findAll('a', attrs={'class': link_class})
    if of_count > -1:
        self.assertEqual(len(anchors), of_count, msg)
    a = anchors[link_no]
    href = a['href']
    # 2.4 sgmllib/HTMLParser doesn't decode HTML entities, this
    # fixes the query parameter separator.
    # TODO(mikie): how do we properly detect the sgmllib version?
    if int(sys.version.split(' ')[0].split('.')[1]) < 5:
        href = href.replace('&amp;', '&')
    args = util.href_to_queryparam_dict(href)
    args['confirm'] = 1
    url = response.request['PATH_INFO']
    return self.client.get(url, args)
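# A hypothetical call site for assertGetLink, sketched for illustration;
# the URL, link class and expected count below are made-up values.
def test_follow_delete_link(self):
    response = self.client.get('/items/')
    confirm = self.assertGetLink(response, 'delete-link',
                                 link_no=0, of_count=3)
    self.assertEqual(confirm.status_code, 200)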
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib

from BeautifulSoup import BeautifulSoup

wiki_url = 'https://en.wikipedia.org/wiki/Game_of_Thrones'
wiki_html = urllib.urlopen(wiki_url).read()
wiki_content = BeautifulSoup(wiki_html)

# The first wikitable on the overview page links to each season's article.
seasons_table = wiki_content.find('table', attrs={'class': 'wikitable'})
seasons = seasons_table.findAll(
    'a',
    attrs={'href': re.compile(r'/wiki/Game_of_Thrones_\(season_?[0-9]+\)')})

views = 0
for season in seasons:
    season_url = 'https://en.wikipedia.org' + season['href']
    season_html = urllib.urlopen(season_url).read()
    season_content = BeautifulSoup(season_html)
    episodes_table = season_content.find(
        'table', attrs={'class': 'wikitable plainrowheaders wikiepisodetable'})
    if episodes_table:
        episode_rows = episodes_table.findAll('tr', attrs={'class': 'vevent'})
        if episode_rows:
            # The original snippet is truncated here. A plausible completion,
            # assuming the last cell of each episode row holds the U.S.
            # viewers figure in millions (the exact column is not confirmed
            # by the source):
            for row in episode_rows:
                cells = row.findAll('td')
                if cells:
                    try:
                        views += float(cells[-1].contents[0])
                    except (ValueError, IndexError):
                        pass

print 'Total views (millions):', views
def run(self):
    in_soup = BeautifulSoup(self.markup_source)
    out_soup = BeautifulSoup()
    env = {}
    return self.__walk(in_soup, out_soup, env).prettify()
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

url = "https://scrapebook22.appspot.com/"
response = urlopen(url).read()
soup = BeautifulSoup(response)

print soup.html.head.title.string

# Open the CSV once, outside the loop, so each profile appends a row
# instead of reopening (and truncating) the file on every iteration.
csv_file = open("list.csv", "w")
for link in soup.findAll("a"):
    if link.string == "See full profile":
        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)
        email = person_soup.find("span", attrs={"class": "email"}).string
        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        city = person_soup.find("span", attrs={"data-city": True}).string
        print name + "," + email + "," + city
        csv_file.write(name + "," + email + "," + city + "\n")
csv_file.close()
def soup(self, data):
    return BeautifulSoup(data)
def get_polls_all(fd, limit=None):
    """Scrape a file-like object and return a list in which each element
    represents information about a poll.
    """
    soup = BeautifulSoup(fd)
    tables = soup.findAll("table", {"class": "wikitable sortable"})
    if len(tables) > 1:
        # TODO This can actually be handled by checking for info inside
        # each table
        raise Exception("Too many tables found")
    all_trs = tables[0].findAll("tr")
    tr_lines = all_trs[1:]

    # Find out the parties' names. All names are on the first line of the
    # table; search for span tags there.
    span_tags = all_trs[0].findAllNext("span")
    parties_names = [f.string for f in span_tags]

    all_polls = []
    # TODO Further asserts/verifies are needed to make sure we can use
    # this table
    for poll in tr_lines:
        if limit and len(all_polls) >= limit:
            _log.debug("Stopped parsing. Already parsed until limit = %s"
                       % limit)
            break
        cells = poll.findAll("td")
        if len(cells) != 9:
            _log.info("Stopping parsing. Row does not have the 9 columns "
                      "needed to parse stats.")
            break
        cells_t = [_clean_data(c.string) if c.string is not None else None
                   for c in cells]
        a_tag = cells[1].find('a')
        href = dict(a_tag.attrs)["href"]
        institute = a_tag.string
        current_poll_data = {
            # We actually handle this OK, but clients will probably have
            # problems with the date format
            "date": _clean_data(cells_t[0]),
            "source": {
                "href": href,
                "name": institute,
            },
            "parties": {}
        }
        # Party results live in columns 2..7, in the same order as the
        # header names collected above.
        current_poll_data["parties"].update(
            (party, cells_t[n])
            for party, n in izip(parties_names, range(2, 8)))
        all_polls.append(current_poll_data)
        _log.info("Parsed polls for %s" % cells_t[0])
    return all_polls
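# A minimal usage sketch for get_polls_all; "polls_page.html" is a
# hypothetical saved copy of the polls wiki page, not a file named in the
# original code. BeautifulSoup 3 accepts an open file directly.
fd = open("polls_page.html")
for poll in get_polls_all(fd, limit=5):
    print poll["date"], poll["source"]["name"]
fd.close()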