import re
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup, Comment


def sanitizeHtml(value, base_url=None):
    # Drop wrapper tags outright; their text contents are kept.
    value = (value.replace('<div>', '').replace('</div>', '')
                  .replace('<p>', '').replace('</p>', '')
                  .replace('<span>', '').replace('</span>', ''))
    # Match "javascript:" / "vbscript:" even when obfuscated with
    # whitespace or hex entities between the characters.
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'br i em strong ul ol li u b a h1 h2 h3 blockquote'.split()
    validAttrs = 'href'.split()
    urlAttrs = 'href src'.split()  # Attributes which should hold a URL
    soup = BeautifulSoup(value)
    # Get rid of comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.extract()
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')
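# A minimal usage sketch for sanitizeHtml above; the input string and base
# URL are illustrative assumptions, not taken from the original code.
dirty = '<p>Hello <a href="/about">about</a><script>alert(1)</script></p>'
print sanitizeHtml(dirty, base_url='http://example.com')
# Expected (roughly): u'Hello <a href="http://example.com/about">about</a>'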
def _parse_tabela(self, html):
    # Result rows of the CEP search carry an onclick handler that calls
    # detalharCep with two numeric arguments; use that to find them.
    soup = BeautifulSoup(html)
    linhas = soup.findAll(
        'tr',
        attrs={'onclick': re.compile(r"javascript:detalharCep\('\d+','\d+'\);")})
    return [self._parse_linha_tabela(linha) for linha in linhas]
def Get6Songs(self, event):
    # Get the current song and fetch similar tracks from Pandora, e.g.
    # http://www.pandora.com/music/song/interpol/obstacle+1
    val = self.lc_songdora_results.GetFirstSelected()
    if val >= 0:
        # A result row is selected; reuse its stored Pandora URL.
        pandora_url = self.lc_songdora_results.GetItem(val, 3).GetText()
    else:
        # No selection: build the URL from the currently playing song.
        partist = self.parent.current_song.artist.replace(' ', '+')
        ptrack = self.parent.current_song.song.replace(' ', '+')
        pandora_url = PANDORA_SONG_URL + urllib2.quote(
            partist.encode('utf8') + '/' + ptrack.encode('utf8'))
    self.st_songdora_song.SetLabel(self.parent.current_song.artist + ' - ' +
                                   self.parent.current_song.song)
    page = urllib2.urlopen(pandora_url)
    print pandora_url
    soup = BeautifulSoup(page)
    counter = 0
    for songs in soup('span', id="similar_song"):
        # Each <span id="similar_song"> holds the track link, artist link
        # and album link at fixed positions in its contents.
        t = songs.contents[1]
        track = str(t).split('"')[3]
        link = PANDORA_SONG_URL + str(t).split('"')[1]
        a = songs.contents[6]
        soupa = BeautifulSoup(str(a))
        artist = soupa.a.string
        b = songs.contents[11]
        soupb = BeautifulSoup(str(b))
        album = soupb.a.string
        self.lc_songdora_results.InsertStringItem(counter, artist)
        self.lc_songdora_results.SetStringItem(counter, 1, track)
        self.lc_songdora_results.SetStringItem(counter, 2, album)
        self.lc_songdora_results.SetStringItem(counter, 3, link)
        counter += 1
def _parse_detalhe(self, html):
    soup = BeautifulSoup(html.decode('ISO-8859-1'))
    value_cells = soup.findAll('td', attrs={'class': 'value'})
    values = [cell.firstText(text=True) for cell in value_cells]
    # The third cell holds "Localidade/UF" (city/state) as a single string.
    localidade, uf = values[2].split('/')
    return {
        'Logradouro': values[0],
        'Bairro': values[1],
        'Localidade': localidade,
        'UF': uf,
        'CEP': values[3],
    }
class CorreiosWebsiteScraper(object):

    def __init__(self, http_client=urllib2):
        self.url = ('http://websro.correios.com.br/sro_bin/txect01$.QueryList'
                    '?P_ITEMCODE=&P_LINGUA=001&P_TESTE=&P_TIPO=001&P_COD_UNI=')
        self.http_client = http_client

    def get_encomenda_info(self, numero):
        request = self.http_client.urlopen('%s%s' % (self.url, numero))
        html = request.read()
        request.close()
        if html:
            encomenda = Encomenda(numero)
            for status in self._get_all_status_from_html(html):
                encomenda.adicionar_status(status)
            return encomenda

    def _get_all_status_from_html(self, html):
        html_info = re.search('.*(<table.*</TABLE>).*', html, re.S)
        try:
            table = html_info.group(1)
        except AttributeError:
            return [-1]
        soup = BeautifulSoup(table)
        status = []
        count = 0
        for tr in soup.table:
            # The first rows are headers; data rows start after row 4.
            if count > 4 and str(tr).strip() != '':
                if re.match(r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}',
                            tr.contents[0].string):
                    status.append(
                        Status(data=unicode(tr.contents[0].string),
                               local=unicode(tr.contents[1].string),
                               situacao=unicode(tr.contents[2].font.string)))
                else:
                    # Rows without a timestamp carry details for the
                    # previous status entry.
                    status[-1].detalhes = unicode(tr.contents[0].string)
            count += 1
        return status
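# A minimal usage sketch for CorreiosWebsiteScraper; the tracking code below
# is a made-up placeholder, and the iteration assumes Encomenda exposes the
# statuses added via adicionar_status as a list attribute.
scraper = CorreiosWebsiteScraper()
encomenda = scraper.get_encomenda_info('SS123456789BR')  # hypothetical code
if encomenda is not None:
    for status in encomenda.status:  # assumed attribute name
        print status.data, status.local, status.situacao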
def assertGetLink(self, response, link_class, link_no, of_count=-1, msg=''):
    """Tries to find an anchor element with the given class from the
    response. Checks that there are of_count total links of that class
    (unless of_count == -1). Gets the page from that link.
    """
    self.assertWellformed(response)
    parsed = BeautifulSoup.BeautifulSoup(response.content)
    anchors = parsed.findAll('a', attrs={'class': link_class})
    if of_count > -1:
        self.assertEqual(len(anchors), of_count, msg)
    a = anchors[link_no]
    href = a['href']
    # 2.4 sgmllib/HTMLParser doesn't decode HTML entities, this
    # fixes the query parameter separator.
    # TODO(mikie): how do we properly detect the sgmllib version?
    if int(sys.version.split(' ')[0].split('.')[1]) < 5:
        href = href.replace('&amp;', '&')
    args = util.href_to_queryparam_dict(href)
    args['confirm'] = 1
    url = response.request['PATH_INFO']
    return self.client.get(url, args)
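# A hypothetical call site for assertGetLink, sketched for illustration;
# the URL, link class and expected count below are made-up values.
def test_follow_delete_link(self):
    response = self.client.get('/items/')
    confirm = self.assertGetLink(response, 'delete-link',
                                 link_no=0, of_count=3)
    self.assertEqual(confirm.status_code, 200)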
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib

from BeautifulSoup import BeautifulSoup

wiki_url = 'https://en.wikipedia.org/wiki/Game_of_Thrones'
wiki_html = urllib.urlopen(wiki_url).read()
wiki_content = BeautifulSoup(wiki_html)

# The first wikitable on the overview page links to each season's article.
seasons_table = wiki_content.find('table', attrs={'class': 'wikitable'})
seasons = seasons_table.findAll(
    'a',
    attrs={'href': re.compile(r'/wiki/Game_of_Thrones_\(season_?[0-9]+\)')})

views = 0
for season in seasons:
    season_url = 'https://en.wikipedia.org' + season['href']
    season_html = urllib.urlopen(season_url).read()
    season_content = BeautifulSoup(season_html)
    episodes_table = season_content.find(
        'table', attrs={'class': 'wikitable plainrowheaders wikiepisodetable'})
    if episodes_table:
        episode_rows = episodes_table.findAll('tr', attrs={'class': 'vevent'})
        if episode_rows:
            # The original snippet is truncated here. A plausible completion,
            # assuming the last cell of each episode row holds the U.S.
            # viewers figure in millions (the exact column is not confirmed
            # by the source):
            for row in episode_rows:
                cells = row.findAll('td')
                if cells:
                    try:
                        views += float(cells[-1].contents[0])
                    except (ValueError, IndexError):
                        pass

print 'Total views (millions):', views
def run(self):
    in_soup = BeautifulSoup(self.markup_source)
    out_soup = BeautifulSoup()
    env = {}
    return self.__walk(in_soup, out_soup, env).prettify()
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

url = "https://scrapebook22.appspot.com/"
response = urlopen(url).read()
soup = BeautifulSoup(response)

print soup.html.head.title.string

# Open the CSV once, outside the loop, so each profile appends a row
# instead of reopening (and truncating) the file on every iteration.
csv_file = open("list.csv", "w")
for link in soup.findAll("a"):
    if link.string == "See full profile":
        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)
        email = person_soup.find("span", attrs={"class": "email"}).string
        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        city = person_soup.find("span", attrs={"data-city": True}).string
        print name + "," + email + "," + city
        csv_file.write(name + "," + email + "," + city + "\n")
csv_file.close()
def soup(self, data):
    return BeautifulSoup(data)
def get_polls_all(fd, limit=None):
    """Scrape a file-like object and return a list in which each element
    represents information about a poll.
    """
    soup = BeautifulSoup(fd)
    tables = soup.findAll("table", {"class": "wikitable sortable"})
    if len(tables) > 1:
        # TODO This can actually be handled by checking for info inside
        # each table
        raise Exception("Too many tables found")
    all_trs = tables[0].findAll("tr")
    tr_lines = all_trs[1:]

    # Find out the parties' names. All names are on the first line of the
    # table; search for span tags there.
    span_tags = all_trs[0].findAllNext("span")
    parties_names = [f.string for f in span_tags]

    all_polls = []
    # TODO Further asserts/verifies are needed to make sure we can use
    # this table
    for poll in tr_lines:
        if limit and len(all_polls) >= limit:
            _log.debug("Stopped parsing. Already parsed until limit = %s"
                       % limit)
            break
        cells = poll.findAll("td")
        if len(cells) != 9:
            _log.info("Stopping parsing. Row does not have the 9 columns "
                      "needed to parse stats.")
            break
        cells_t = [_clean_data(c.string) if c.string is not None else None
                   for c in cells]
        a_tag = cells[1].find('a')
        href = dict(a_tag.attrs)["href"]
        institute = a_tag.string
        current_poll_data = {
            # We actually handle this OK, but clients will probably have
            # problems with the date format
            "date": _clean_data(cells_t[0]),
            "source": {
                "href": href,
                "name": institute,
            },
            "parties": {}
        }
        # Party results live in columns 2..7, in the same order as the
        # header names collected above.
        current_poll_data["parties"].update(
            (party, cells_t[n])
            for party, n in izip(parties_names, range(2, 8)))
        all_polls.append(current_poll_data)
        _log.info("Parsed polls for %s" % cells_t[0])
    return all_polls
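# A minimal usage sketch for get_polls_all; "polls_page.html" is a
# hypothetical saved copy of the polls wiki page, not a file named in the
# original code. BeautifulSoup 3 accepts an open file directly.
fd = open("polls_page.html")
for poll in get_polls_all(fd, limit=5):
    print poll["date"], poll["source"]["name"]
fd.close()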