Example #1
import re
from urlparse import urljoin

from BeautifulSoup import BeautifulSoup, Comment


def sanitizeHtml(value, base_url=None):
    # Drop wrapper tags entirely (only the bare forms, without attributes).
    for wrapper in ('<div>', '</div>', '<p>', '</p>', '<span>', '</span>'):
        value = value.replace(wrapper, '')
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'br i em strong ul ol li u b a h1 h2 h3 blockquote'.split()
    validAttrs = 'href'.split()
    urlAttrs = 'href src'.split() # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.extract()  # Drop the tag (and its children) entirely.
            continue
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs.append((attr, val))

    return soup.renderContents().decode('utf8')
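
A minimal usage sketch (the input string and base_url are hypothetical, and the imports above are assumed to be in place):

dirty = '<p>Hi <a href="javascript:alert(1)">profile</a><script>evil()</script></p>'
print sanitizeHtml(dirty, base_url='http://example.com/')
# The <script> element is removed outright, the "javascript:" prefix is
# stripped from the href, and the remaining value is resolved against base_url.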
Example #2
 def _parse_tabela(self, html):
     soup = BeautifulSoup(html)
     linhas = soup.findAll(
         'tr',
         attrs={
             'onclick':
             re.compile(r"javascript:detalharCep\('\d+','\d+'\);")
         })
     return [self._parse_linha_tabela(linha) for linha in linhas]
Example #3
    def Get6Songs(self, event):
        # Get the current song and look it up on Pandora, e.g.:
        #http://www.pandora.com/music/song/interpol/obstacle+1

        val = self.lc_songdora_results.GetFirstSelected()
        if val >= 0:
            pandora_url = self.lc_songdora_results.GetItem(val, 3).GetText()
        else:
            partist = self.parent.current_song.artist.replace(' ', '+')
            ptrack = self.parent.current_song.song.replace(' ', '+')
            pandora_url = PANDORA_SONG_URL + urllib2.quote(
                partist.encode('utf8') + '/' + ptrack.encode('utf8'))
            self.st_songdora_song.SetLabel(self.parent.current_song.artist +
                                           ' - ' +
                                           self.parent.current_song.song)
        #print string_tools.unescape(pandora_url)
        #pandora_url = urllib2.quote(pandora_url.encode('utf8'))
        page = urllib2.urlopen(pandora_url)

        print pandora_url

        soup = BeautifulSoup(page)
        counter = 0
        for songs in soup('span', id="similar_song"):  #span id="similar_song"
            t = songs.contents[1]
            track = str(t).split('"')[3]
            link = PANDORA_SONG_URL + str(t).split('"')[1]
            a = songs.contents[6]
            soupa = BeautifulSoup(str(a))
            artist = soupa.a.string

            b = songs.contents[11]
            soupb = BeautifulSoup(str(b))
            album = soupb.a.string

            self.lc_songdora_results.InsertStringItem(counter, artist)
            self.lc_songdora_results.SetStringItem(counter, 1, track)
            self.lc_songdora_results.SetStringItem(counter, 2, album)
            self.lc_songdora_results.SetStringItem(counter, 3, link)
            counter = counter + 1
Example #4
    def _parse_detalhe(self, html):
        soup = BeautifulSoup(html.decode('ISO-8859-1'))

        value_cells = soup.findAll('td', attrs={'class': 'value'})
        values = [cell.firstText(text=True) for cell in value_cells]
        localidade, uf = values[2].split('/')
        values_dict = {
            'Logradouro': values[0],
            'Bairro': values[1],
            'Localidade': localidade,
            'UF': uf,
            'CEP': values[3]
        }
        return values_dict
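
A hedged illustration of the markup _parse_detalhe expects; the fragment below is a made-up stand-in for the real Correios detail page, which the example does not show:

fragment = (
    '<table>'
    '<tr><td class="value">Rua Exemplo</td></tr>'
    '<tr><td class="value">Centro</td></tr>'
    '<tr><td class="value">Sao Paulo/SP</td></tr>'
    '<tr><td class="value">01001-000</td></tr>'
    '</table>'
)
# _parse_detalhe(fragment.encode('ISO-8859-1')) would then return:
# {'Logradouro': u'Rua Exemplo', 'Bairro': u'Centro',
#  'Localidade': u'Sao Paulo', 'UF': u'SP', 'CEP': u'01001-000'}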
Example #5
class CorreiosWebsiteScraper(object):
    def __init__(self, http_client=urllib2):
        self.url = 'http://websro.correios.com.br/sro_bin/txect01$.QueryList?P_ITEMCODE=&P_LINGUA=001&P_TESTE=&P_TIPO=001&P_COD_UNI='
        self.http_client = http_client

    def get_encomenda_info(self, numero):
        request = self.http_client.urlopen('%s%s' % (self.url, numero))
        html = request.read()
        request.close()
        if html:
            encomenda = Encomenda(numero)
            for status in self._get_all_status_from_html(html):
                encomenda.adicionar_status(status)
            return encomenda

    def _get_all_status_from_html(self, html):
        html_info = re.search('.*(<table.*</TABLE>).*', html, re.S)
        try:
            table = html_info.group(1)
        except AttributeError:
            return [-1]

        soup = BeautifulSoup(table)

        status = []
        count = 0
        for tr in soup.table:
            if count > 4 and str(tr).strip() != '':
                if re.match(r'\d{2}\/\d{2}\/\d{4} \d{2}:\d{2}',
                            tr.contents[0].string):
                    status.append(
                        Status(data=unicode(tr.contents[0].string),
                               local=unicode(tr.contents[1].string),
                               situacao=unicode(tr.contents[2].font.string)))
                else:
                    status[-1].detalhes = unicode(tr.contents[0].string)

            count = count + 1

        return status
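
The constructor takes an injectable http_client (anything with a urlopen method), so the scraper can be exercised without hitting the Correios site. A minimal test sketch, assuming a saved copy of a tracking page; FakeClient and the fixture file are hypothetical, not part of the original code:

class FakeResponse(object):
    def __init__(self, html):
        self._html = html

    def read(self):
        return self._html

    def close(self):
        pass


class FakeClient(object):
    def __init__(self, html):
        self._html = html

    def urlopen(self, url):
        return FakeResponse(self._html)


canned_html = open('tracking_fixture.html').read()  # hypothetical saved page
scraper = CorreiosWebsiteScraper(http_client=FakeClient(canned_html))
encomenda = scraper.get_encomenda_info('SS123456789BR')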
Example #6
 def assertGetLink(self, response, link_class, link_no,
                   of_count=-1, msg=''):
   """Tries to find an anchor element with the given class from the response.
   Checks that there are of_count total links of that class
   (unless of_count==-1). Gets the page from that link.
   """
   self.assertWellformed(response)
   parsed = BeautifulSoup.BeautifulSoup(response.content)
   found = parsed.findAll('a', attrs={'class': link_class})
   anchors = [a for a in found]
   if of_count > -1:
     self.assertEqual(len(anchors), of_count, msg)
   a = anchors[link_no]
   href = a['href']
   # 2.4 sgmllib/HTMLParser doesn't decode HTML entities, this
   # fixes the query parameter separator.
   # TODO(mikie): how do we properly detect the sgmllib version?
   if int(sys.version.split(' ')[0].split('.')[1]) < 5:
     href = href.replace('&amp;', '&')
   args = util.href_to_queryparam_dict(href)
   args['confirm'] = 1
   url = response.request['PATH_INFO']
   return self.client.get(url, args)
Example #7
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib

from BeautifulSoup import BeautifulSoup

wiki_url = 'https://en.wikipedia.org/wiki/Game_of_Thrones'
wiki_html = urllib.urlopen(wiki_url).read()
wiki_content = BeautifulSoup(wiki_html)

seasons_table = wiki_content.find('table', attrs={'class': 'wikitable'})
seasons = seasons_table.findAll(
    'a',
    attrs={'href': re.compile(r'/wiki/Game_of_Thrones_\(season_?[0-9]+\)')})

views = 0

for season in seasons:
    season_url = 'https://en.wikipedia.org' + season['href']
    season_html = urllib.urlopen(season_url).read()
    season_content = BeautifulSoup(season_html)

    episodes_table = season_content.find(
        'table', attrs={'class': 'wikitable plainrowheaders wikiepisodetable'})

    if episodes_table:
        episode_rows = episodes_table.findAll('tr', attrs={'class': 'vevent'})

        if episode_rows:
            # The original example is truncated at this point; as a hedged
            # completion, sum the viewer figure found in the last cell of each
            # episode row (assumed to be the "U.S. viewers (millions)" column).
            for row in episode_rows:
                cells = row.findAll('td')
                if not cells:
                    continue
                cell_text = ''.join(cells[-1].findAll(text=True))
                figure = re.search(r'[\d.]+', cell_text)
                if figure:
                    views += float(figure.group(0))

print views
Example #8
 def run(self):
     in_soup = BeautifulSoup(self.markup_source)
     out_soup = BeautifulSoup()
     env = {}
     return self.__walk(in_soup, out_soup, env).prettify()
Example #9
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

url = "https://scrapebook22.appspot.com/"
response = urlopen(url).read()
soup = BeautifulSoup(response)

print soup.html.head.title.string

# Open the output file once, instead of re-opening (and truncating) it for
# every scraped person inside the loop below.
csv_file = open("list.csv", "w")

for link in soup.findAll("a"):
    if link.string == "See full profile":
        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)
        email = person_soup.find("span", attrs={"class": "email"}).string
        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        city = person_soup.find("span", attrs={"data-city": True}).string
        print name + "," + email + "," + city

        csv_file.write(name + "," + email + "," + city + "\n")

csv_file.close()
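
Building the CSV line by hand breaks as soon as a name or city contains a comma; a hedged alternative is the stdlib csv module, sketched below (the append_row helper is hypothetical and would be called with the values scraped above):

import csv

def append_row(path, name, email, city):
    # csv.writer takes care of quoting commas and quotes inside the values.
    with open(path, "ab") as f:
        csv.writer(f).writerow([name.encode("utf8"),
                                email.encode("utf8"),
                                city.encode("utf8")])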
Example #10
 def soup(self, data):
     return BeautifulSoup(data)
Example #11
def get_polls_all(fd, limit=None):
    """
    Scrape a file-like object and return a list in which each element
    represents information about one poll.
    """
    soup = BeautifulSoup(fd)

    tables = soup.findAll("table", {"class": "wikitable sortable"})

    # TODO This could instead be handled by checking the info inside each table.
    if len(tables) > 1:
        raise Exception("Too many tables found")

    all_trs = tables[0].findAll("tr")
    tr_lines = all_trs[1:]

    # Find out the parties' names.
    # All names are on the first row of the table; search for span tags.
    span_tags = all_trs[0].findAllNext("span")
    parties_names = [f.string for f in span_tags]

    all_polls = []
    # TODO Further asserts/verifies are needed to make sure we can use this table
    for poll in tr_lines:
        if limit and len(all_polls) >= limit:
            _log.debug("Stopped parsing. Already parsed until limit = %s" %
                       limit)
            break

        cells = poll.findAll("td")

        if len(cells) != 9:
            _log.info(
                "Stopping parsing: row does not have the 9 columns needed to parse stats.")
            break

        cells_t = [
            _clean_data(c.string) if c.string is not None else None
            for c in cells
        ]

        a_tag = cells[1].find('a')
        href = dict(a_tag.attrs)["href"]
        institute = a_tag.string

        current_poll_data = {
            # The date is stored as scraped; clients will probably still need
            # to normalise its format.
            "date": _clean_data(cells_t[0]),
            "source": {
                "href": href,
                "name": institute,
            },
            "parties": {}
        }

        current_poll_data["parties"].update(
            (party, cells_t[n])
            for party, n in izip(parties_names, range(2, 8)))

        all_polls.append(current_poll_data)

        _log.info("Parsed polls for %s" % cells_t[0])

    return all_polls
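
A minimal usage sketch (the file name and limit are hypothetical; _clean_data, _log and izip are assumed to be available at module level, as the function above requires):

with open("wikipedia_polls.html") as fd:
    polls = get_polls_all(fd, limit=5)

for poll in polls:
    print poll["date"], poll["source"]["name"], poll["parties"]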