def tryParseDetails(htmlTxt, updateString):
    htmlTxt = removeCloseTagAttr(htmlTxt)
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    flightInfo = []
    table = soup.first("table", {"name": "flight_info"})
    trList = []
    if table:
        for tr in table.fetch("tr"):
            if len(tr.fetch("td")) == 4:
                trList.append(tr)
            elif len(tr.fetch("td")) == 1:
                img = tr.first("img", {"alt": "Continuing on To"})
                if img:
                    trList.append(tr)
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 4:
            # collapse non-breaking spaces
            info = getAllTextFromTag(tdList[0]).replace("\xa0", " ").strip()
            infoFrom = getAllTextFromTag(tdList[1]).replace("\xa0", " ").strip()
            infoTo = getAllTextFromTag(tdList[3]).replace("\xa0", " ").strip()
            if info != "":
                flightInfo.append([info, infoFrom, infoTo])
        else:
            flightInfo.append([""])
    flight = ""
    table = soup.first("table", {"name": "headbar2"})
    if table:
        bItem = table.first("b")
        if bItem:
            flight = getAllTextFromTag(bItem)
    if 0 == len(flightInfo) or "" == flight:
        return UNKNOWN_FORMAT, None
    # definition
    df = Definition()
    df.TextElement(flight, style=styleNameBold)
    df.LineBreakElement(1, 2)
    for item in flightInfo:
        # info, from, to
        if len(item) == 3:
            df.TextElement(item[0], style=styleNameHeader)
            if item[1] != "":
                df.LineBreakElement()
                df.TextElement(item[1])
            if item[2] != "":
                gtxt = df.TextElement(item[2])
                gtxt.setJustification(justRight)
            else:
                df.LineBreakElement()
        else:
            df.HorizontalLineElement()
    return RESULTS_DATA, universalDataFormatWithDefinition(df, [["U", updateString]])
class Julienne:
    def __init__(self, table):
        self.soup = BeautifulSoup(table.strip())
        self.row_list = self.soup.first("tbody").findAll("tr")

    def validate(self):
        valid_toplevel = (len(self.soup.contents) == 1
                          and self.soup.contents[0].name == "table")
        num_columns = len(self.soup.first("thead").contents)
        rows = self.row_list
        valid_body = all([len(row) == num_columns for row in rows])
        return valid_toplevel and valid_body

    def columns(self):
        return [tag.string for tag in self.soup.findAll("th")]

    def rows(self):
        rows_sans_whitespace = [[unicode(field.string) for field in row if field != '\n']
                                for row in self.row_list]
        return [OrderedDict(zip(self.columns(), row)) for row in rows_sans_whitespace]

    def select(self, **kwargs):
        # TODO: Implement selecting rows, possibly by some index
        desired_cols = kwargs['columns']
        rows = self.rows()
        return map(lambda row: {key: row[key] for key in desired_cols}, rows)

    def to_csv(self):
        csv_str = ",".join(self.columns()) + "\n"
        csv_str += "\n".join([",".join(row.viewvalues()) for row in self.rows()])
        return csv_str
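# Usage sketch for Julienne (an assumption, not part of the original source):
# the class relies on BeautifulSoup 3 under Python 2 plus collections.OrderedDict,
# and expects a table with explicit <thead> and <tbody> sections.
def _julienne_example():
    table_html = ("<table><thead><th>name</th><th>age</th></thead>"
                  "<tbody><tr><td>alice</td><td>30</td></tr></tbody></table>")
    j = Julienne(table_html)
    print j.columns()   # [u'name', u'age']
    print j.to_csv()    # "name,age" and "alice,30" on two lines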
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException=False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # presumed handler: log the error and give up on this book
        log(SEV_EXC, exceptionAsStr(ex))
        return None
def __call__(self, url):
    try:
        doc = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
    doc = BeautifulSoup(doc)
    title = u''
    description = u''
    # title (guard against pages without a <title> tag)
    if doc.title:
        title = doc.title.string
    if not title:
        title = doc.first('meta', attrs={'name': 'title'})
        if title:
            title = title.get('content')
    # description
    description = doc.first('meta', attrs={'name': 'description'})
    if description:
        description = description.get('content')
    # Find favicon
    favicon_url = doc.first('link', rel='shortcut icon')
    if favicon_url:
        favicon_url = favicon_url.get('href')
    else:
        host_url = urlparse(url)
        favicon_url = host_url[0] + u'://' + host_url[1] + u'/favicon.ico'
    return json.dumps({'title': title,
                       'description': description,
                       'favicon_url': favicon_url})
def scrapeList(self):
    """
    Scrapes the pages for a list.
    Saves the list index page locally.
    Sends for scraping of each month's pages (and then on to the individual messages).
    """
    # Get the page that lists the months of archive pages.
    source = self.fetchPage(self.list_url)
    # The copy of the page we save is filtered for email addresses, links, etc.
    filtered_source = self.filterPage(source)
    # Save our local copy.
    # eg /Users/phil/Sites/examplesite/html/list-name/index.html
    local_index = open(self.publish_dir + '/index.html', 'w')
    local_index.write(filtered_source)
    local_index.close()
    soup = BeautifulSoup(source)
    if not soup.first('table'):
        return
    # Go through each row in the table except the first (which is column headers).
    for row in soup.first('table')('tr')[1:]:
        # Get the text in the first column: "February 2009:"
        archive_date = row('td')[0].string
        if 'quarter' in archive_date:
            (ordinal, nothing, year) = archive_date.split()
            quarter_no = str(['First', 'Second', 'Third', 'Fourth'].index(ordinal) + 1)
            # Strip the colon off.
            year = year[:-1]
            formatted_date = year + 'q' + quarter_no
        else:
            if ' ' in archive_date:
                (month, year) = archive_date.split()
                # Strip the colon off.
                year = year[:-1]
                formatted_date = year + '-' + month
            else:
                formatted_date = archive_date[:-1]
        # Scrape the date page for this month and get all its messages.
        # keep_fetching will be True or False, depending on whether we need
        # to keep getting older months.
        try:
            keep_fetching = self.scrapeMonth(formatted_date)
        except urllib2.HTTPError:
            print "Skipping ", formatted_date
            keep_fetching = True
        if not keep_fetching:
            break
def personSearch(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    ttlPref = soup.first("td", {"class": "TTLPREF"})
    if not ttlPref:
        ttlPref = soup.first("span", {"class": "TTLPREF"})
    if not ttlPref:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # too many results:
    font = ttlPref.first("font", {"color": "#FF0000"})
    if font:
        if "No results." == font.contents[0]:
            return (NO_RESULTS, m411NoResultsText)
        if "Results found in multiple cities." == font.contents[0]:
            brList = ttlPref.fetch("br")
            brList = brList[4:]  # skip text about select
            for br in brList:
                text = str(br.next).replace("<br />", "").replace("\n", "").strip()
                if len(text) > 0:
                    returned.append(text)
            return (MULTIPLE_SELECT, string.join(returned, "\n"))
        return (TOO_MANY_RESULTS, m411TooManyResults)
    # results:
    brList = ttlPref.fetch("br")
    resultsCount = len(brList) - 2
    if 0 == resultsCount:
        # no city?
        if "NO CITY FOUND" == str(brList[1].next).replace("\n", "").strip():
            return (NO_CITY, m411NoCity)
    results = resultsCount / 5
    if results * 5 != resultsCount:  # test if number of <br> is 5*n+2
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # get them
    brList = brList[1:]  # skip first br
    counter = 0
    smallList = []
    for br in brList:
        text = str(br.next).replace("<br />", "").replace("\n", "").strip()
        if results > 0:
            if 0 == counter:
                smallList = [text]
            if 1 == counter or 2 == counter:
                smallList.append(text)
            if 3 == counter:
                smallList.append(text)
                returned.append(smallList)
                results -= 1
        counter += 1
        if 5 == counter:
            counter = 0
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def _parseRandomJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    table = soup.first("table", {"id": "jokeIframeTable2"})
    if not table:
        return UNKNOWN_FORMAT, None
    # title
    titleSpan = table.first("span", {"class": "jokeTitle_v2"})
    if not titleSpan:
        return UNKNOWN_FORMAT, None
    title = getAllTextFromTag(titleSpan)
    # text
    trList = table.fetch("tr")
    text = ""
    if len(trList) > 6:
        tdList = trList[5].fetch("td")
        if len(tdList) == 3:
            text = getAllTextFromToInBrFormat(tdList[1], tdList[2])
            # collapse non-breaking spaces before checking for real content
            if len(text.replace("\xa0", " ").strip()) < 2:
                text = ""
    if "" == text:
        return UNKNOWN_FORMAT, None
    smallList = [title, text]
    # rating
    table = soup.first("table", {"id": "Table5"})
    if table:
        td = table.first("td")
        if td:
            imgList = td.fetch("img", {"src": "%"})
            rating = "not rated"
            translator = {
                "iconrate_one": "1",
                "iconrate_two": "2",
                "iconrate_three": "3",
                "iconrate_four": "4",
                "iconrate_five": "5",
                "iconrate_one_half": "1.5",
                "iconrate_two_half": "2.5",
                "iconrate_three_half": "3.5",
                "iconrate_four_half": "4.5",
                "iconrate_zero_half": "0.5",
            }
            for img in imgList:
                src = img["src"]
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                try:
                    rating = translator[src]
                except KeyError:
                    pass
            smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def _parse_letter_page(self, letter, html, index):
    self._check_finish()
    soup = BeautifulSoup()
    soup.feed(html)
    div = soup.first("div", {"class": "sidebar-module"})
    assert div is not None
    count = int(retrieveContents(div.contents[2]).split()[2])
    offset = 0
    self._lock.acquire()
    try:
        if count <= self._data[letter][0]:
            print 'Letter "%s" is up to date (%d records).' % (letter, self._data[letter][0])
            return True, count, 0
        offset = self._offsets[letter]
    finally:
        self._lock.release()
    spidered = 0
    div = soup.first("div", {"class": "titleList"})
    assert div is not None
    anchors = div.fetch("a")  # "as" is a reserved word, so use another name
    urls = []
    for a in anchors:
        url = _g_manybooks_url + urllib.quote(a["href"])
        urls.append(url)
    for url in urls:
        self._check_finish()
        i = -1
        self._lock.acquire()
        try:
            books = self._data[letter][1]
            i = _find_book_index(books, url, index)
        finally:
            self._lock.release()
        if -1 != i:
            index = i + 1
        else:
            book = _spider_book_info(url, letter)
            if book is not None:
                spidered += 1
                self._lock.acquire()
                try:
                    self._fresh_books.append((letter, index + offset, book))
                    if len(self._fresh_books) == self.flush_after:
                        self._flush_books()
                    offset += 1
                    self._offsets[letter] = offset
                    if self._data[letter][0] + offset == count:
                        return True, count, spidered
                finally:
                    self._lock.release()
    return (index + offset == count), index, spidered
def get_slides(args):
    contents = get_file_contents(args.file)
    soup = BeautifulSoup(markdown(contents))
    hsoup = BeautifulSoup()
    html = Tag(hsoup, 'html')
    hsoup.append(html)
    head = Tag(hsoup, 'head')
    title = Tag(hsoup, 'title')
    title.setString(args.file)
    head.append(title)
    link = Tag(hsoup, 'link')
    link['rel'] = 'stylesheet'
    link['type'] = 'text/css'
    if args.offline:
        link['href'] = 'default.css'
    else:
        link['href'] = 'http://gdg-xian.github.io/html5slides-markdown/themes/default.css'
    head.append(link)
    script = Tag(hsoup, 'script')
    if args.offline:
        script['src'] = 'html5slides.js'
    else:
        script['src'] = 'http://gdg-xian.github.io/html5slides-markdown/javascripts/html5slides.js'
    head.append(script)
    html.append(head)
    body = Tag(hsoup, 'body')
    body['style'] = 'display:none'
    section = Tag(hsoup, 'section')
    section['class'] = 'slides layout-regular template-default'
    body.append(section)
    elements = []
    elements.append(soup.first())
    elements.extend(soup.first().findNextSiblings())
    article = Tag(hsoup, 'article')
    section.append(article)
    for element in elements:
        if element.name == 'hr':
            article = Tag(hsoup, 'article')
            section.append(article)
        else:
            article.append(element)
    html.append(body)
    return prettify(html)
def parseJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    table = soup.first("table", {"width": "328", "id": "Table2"})
    if not table:
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    tdList = table.fetch("td", {"colspan": "3", "valign": "top", "class": "body"})
    if 3 != len(tdList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    # simple format - simple parser
    title = getAllTextFromTag(tdList[0]).strip()
    text = getAllTextFromToInBrFormat(tdList[1], tdList[2].previous)
    smallList = [title, text]
    # add rating information
    # (a random joke sometimes returns next to nothing, so make sure
    # there is real content before looking for a rating)
    if len(title) + len(text) > 16:
        span = soup.first("span", {"class": "body"})
        if span:
            text = getAllTextFromTag(span).replace("\n", "").strip()
            img = span.first("img", {"src": "%"})
            if text.startswith("CURRENT RATING") and img:
                src = img["src"]
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                translator = {
                    "iconrate_one": "1",
                    "iconrate_two": "2",
                    "iconrate_three": "3",
                    "iconrate_four": "4",
                    "iconrate_five": "5",
                    "iconrate_one_half": "1.5",
                    "iconrate_two_half": "2.5",
                    "iconrate_three_half": "3.5",
                    "iconrate_four_half": "4.5",
                    "iconrate_zero_half": "0.5",
                }
                rating = "not rated"
                try:
                    rating = translator[src]
                except KeyError:
                    pass
                smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def onData(self, info, data):
    try:
        self._logger.RTC_INFO('got input: ' + pformat(info) + ', ' + pformat(data))
        t = data.tm.sec + data.tm.nsec * 1e-9
        portid = (info['component'], info['port'])
        if portid not in self._basewme:
            self._logger.RTC_INFO('First input from this port > create basic structure on WME')
            iid = self._agent.GetInputLink()
            wme = self._agent.CreateIdWME(iid, 'data')
            self._basewme[portid] = wme
            ot = type(data.data)
            if ot in types.StringTypes:
                if data.data[:5] == '<?xml':
                    self._logger.RTC_INFO('Parsing XML type input')
                    doc = BeautifulSoup(data.data)
                    wme2 = self._agent.CreateIdWME(wme, 'data')
                    usedwords = {}
                    self.docRecur(doc.first(), wme2, usedwords)
                    self._datawme[portid] = wme2
                else:
                    self._datawme[portid] = self._agent.CreateStringWME(wme, 'data', data.data)
            elif ot == types.IntType:
                self._datawme[portid] = self._agent.CreateIntWME(wme, 'data', data.data)
            elif ot == types.FloatType:
                self._datawme[portid] = self._agent.CreateFloatWME(wme, 'data', data.data)
            else:
                self._logger.RTC_ERROR('unsupported data type: ' + str(ot))
            self._timewme[portid] = self._agent.CreateFloatWME(wme, 'time', t)
            self._dataidwme[portid] = self._agent.CreateIntWME(wme, 'id', self._dataid)
            for k, v in info.iteritems():
                self._agent.CreateStringWME(wme, k, v)
        else:
            self._agent.Update(self._timewme[portid], t)
            self._agent.Update(self._dataidwme[portid], self._dataid)
            if type(data.data) in types.StringTypes and data.data[:5] == '<?xml':
                self._logger.RTC_INFO('Parsing XML type input')
                doc = BeautifulSoup(data.data)
                self._agent.DestroyWME(self._datawme[portid])
                wme2 = self._agent.CreateIdWME(self._basewme[portid], 'data')
                usedwords = {}
                self.docRecur(doc.first(), wme2, usedwords)
                self._datawme[portid] = wme2
            else:
                self._agent.Update(self._datawme[portid], data.data)
        self._agent.Commit()
        self._dataid += 1
    except:
        self._logger.RTC_ERROR(traceback.format_exc())
def getTextFromDirtyText(dirtyText):
    soup = BeautifulSoup()
    soup.feed("<xxx>" + dirtyText + "</xxx><yyy>test</yyy>")
    dirtySoup = soup.first("xxx")
    textWithBr = getAllTextFromToInBrFormat(dirtySoup, getLastElementFromTag(dirtySoup).next)
    text = textWithBr.replace("<br>", "\n").replace("<b>", "").replace("</b>", "")
    return text
def parseGasOld(htmlTxt, url=None, dbgLevel=0):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    testTitle = soup.first("title")
    if testTitle:
        if getAllTextFromTag(testTitle).startswith("GasBuddy.com - Find cheap gas prices in your city"):
            return (LOCATION_UNKNOWN, gLocationUnknownText)
    outerList = []
    trList = soup.fetch("tr")
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 8 == len(tdList):
            if tdList[1].first("table"):
                price = getAllTextFromTag(tdList[0]).strip()
                name = getAllTextFromTag(tdList[2]).strip()
                address = getAllTextFromTag(tdList[4]).strip()
                area = getAllTextFromTag(tdList[5]).strip()
                time = getAllTextFromTag(tdList[6]).strip()
                smallList = [price, name, address, area, time]
                outerList.append(smallList)
        else:
            if 0 != len(tdList):
                firstB = tdList[0].first("b")
                if firstB:
                    if getAllTextFromTag(firstB).startswith("No gas prices found."):
                        return (NO_RESULTS, gNoResultsText)
    if 0 == len(outerList):
        if dbgLevel > 0:
            print "len(outerList)==0"
        return parsingFailed(url, htmlTxt)
    return (GAS_DATA, universalDataFormatReplaceEntities(outerList))
def reversePhoneLookupWhitepages(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    div = soup.first("div", {"class": "listings"})
    if div:
        for table in div.fetch("table"):
            for tr in table.fetch("tr"):
                text1 = tr.first("div", {"class": "textb"})
                text2 = tr.first("div", {"class": "text"})
                if text1 and text2:
                    name = getAllTextFromTag(text1)
                    cont = getAllTextFromToInBrFormat(text2, getLastElementFromTag(text2).next)
                    parts = cont.split("<br>")
                    (address, city, phone) = ("", "", "")
                    if len(parts) == 3:
                        (address, city, phone) = parts
                    if len(parts) == 2:
                        (city, phone) = parts
                    if len(parts) == 4:
                        (prefix, address, city, phone) = parts
                    returned.append((name, address.strip(), city.strip(), phone.strip()))
    if len(returned) == 0:
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def parseFirstDayHtml(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, hum, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]
    bItems = soup.fetch("b", {"class": "obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("°F", "").strip()
    bItem = soup.first("b", {"class": "obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("°F", "").strip()
    tdList = soup.fetch("td", {"class": "obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert len(tdList) == 4
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%", "").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace("in.", "inches").strip()  # todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("°F", "").strip()
    for r in returned:
        if r is None or r == "":
            return None
    return returned
def retrievePostText(self, url):
    mUrl = url.replace("http://ca.indeed.com/", "http://ca.indeed.com/m/")
    html = self.doHttpRequest(mUrl)
    # description is located within <div id='desc'>
    soup = BeautifulSoup(html)
    div = soup.first('div', {'id': 'desc'})
    return self.fetch_body(div)
def get_title(url):
    """Fetches the contents of url and extracts (and utf-8 encodes)
    the contents of <title>"""
    if not url or not url.startswith("http://"):
        return None
    try:
        # if we don't find it in the first kb of the resource, we
        # probably won't find it
        opener = urlopen(url, timeout=15)
        text = opener.read(1024)
        opener.close()
        bs = BeautifulSoup(text)
        if not bs:
            return
        title_bs = bs.first("title")
        if not title_bs or title_bs.children:
            return
        return title_bs.text.encode("utf-8")
    except:
        return None
def cleanupResults(results):
    '''
    Extract and clean up the returned HTML.
    Return a pipe-separated list.
    '''
    pipeSeparatedData = []
    status = results.status
    if status == 200:
        data = results.read()
        soup = BeautifulSoup(data)
        target_HTML = soup.first('pre')  # grab text inside <pre> tags
        # break it up into lines
        targetLines = []
        for line in target_HTML:
            targetLines.append(line)
        # we're interested in line 2
        # break it down to a list
        sunriseDataList = targetLines[2].split('\n')
        # strip leading/trailing space
        sunriseDataList = map(stripit, sunriseDataList)
        # remove empty members
        sunriseDataList = filter(None, sunriseDataList)
        # parse line to produce pipe separated name/value string
        for en in sunriseDataList:
            icu = re.sub(r' +', '|', en)
            pipeSeparatedData.append(icu)
        # sample output
        # [u'Begin civil twilight|6:56 a.m.', u'Sunrise|7:21 a.m.', u'Sun transit|1:32 p.m.',
        #  u'Sunset|7:44 p.m.', u'End civil twilight|8:08 p.m.']
    else:
        print 'Error from Sunrise/Sunset website - no data available'
    return pipeSeparatedData
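# Hedged usage sketch (not from the original source): cleanupResults expects a
# response object exposing .status and .read(), such as httplib.HTTPResponse.
# The host and path below are hypothetical placeholders for the Sunrise/Sunset
# service the function's docstring refers to.
def _cleanup_results_example():
    import httplib
    conn = httplib.HTTPConnection("example.navy.mil")   # hypothetical host
    conn.request("GET", "/sunrise-sunset")              # hypothetical path
    response = conn.getresponse()
    return cleanupResults(response)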
def getTorrentDetails(url):
    from BeautifulSoup import BeautifulSoup, SoupStrainer
    html = getHTML(url)
    spanResults = BeautifulSoup(html)
    ltrSpan = spanResults.find('span', {'dir': 'ltr'})
    if ltrSpan is None:
        raise Exception
    if 'artist.php' not in ltrSpan.next.attrs[0][1] or (
            'Album' not in ltrSpan.nextSibling and
            'Anthology' not in ltrSpan.nextSibling and
            'Compilation' not in ltrSpan.nextSibling and
            'Single' not in ltrSpan.nextSibling and
            'Soundtrack' not in ltrSpan.nextSibling and
            'EP' not in ltrSpan.nextSibling):
        raise Exception
    tagStrainer = SoupStrainer('a', href=re.compile(r'torrents\.php\?taglist='))
    tagResults = BeautifulSoup(html, tagStrainer)
    artist = ''
    for element in ltrSpan.contents[:-1]:
        try:
            artist += element.string
        except:
            artist += element
    album = ltrSpan.contents[-1][3:]
    genre = tagResults.first().string
    return (unescape(artist), unescape(album), genre)
def test_link_other_proj_no_html2text(self):
    # without html2text, the dash in other-project doesn't get escaped right
    html = BeautifulSoup(
        '''<pre>Foo: <a href="/p/other-project/issues/detail?id=1">issue other-project:1</a></pre>''')
    assert_equal(
        _as_markdown(html.first(), 'myproj'),
        'Foo: [issue other\\-project:1](https://code.google.com/p/other-project/issues/detail?id=1)')
def parseCurrencyData(htmlText):
    global _g_imgRe
    soup = BeautifulSoup()
    soup.feed(htmlText)
    # <table width="60%" border="0" cellpadding="3" summary="Displays latest tourist currency rates">
    table = soup.first("table", {"border": "0",
                                 "width": "60%",
                                 "cellpadding": "3",
                                 "summary": "Displays latest tourist currency rates"})
    assert table is not None
    tbody = table.first("tbody")
    assert tbody is not None
    rows = tbody.fetch("tr")
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        img = cells[0].fetch("img")[0]
        match = _g_imgRe.match(img["src"])
        if match is None:
            continue
        abbrev = match.group(1)
        rate = float(str(cells[2].contents[0]).strip().split()[0])
        currencies[abbrev] = rate
    # rebase every rate against USD so that currencies["USD"] == 1
    usdRate = currencies["USD"]
    for key in currencies.iterkeys():
        currencies[key] = currencies[key] / usdRate
    assert 1 == currencies["USD"]
    return (RESULTS_DATA, currencies)
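# Worked example of the USD rebasing step above (the rates are made up): if the
# scraped table quotes all currencies against some third currency, dividing by
# the USD rate re-expresses everything per one US dollar.
def _rebase_example():
    currencies = {"USD": 1.9, "EUR": 1.5}   # hypothetical rates quoted in GBP
    usdRate = currencies["USD"]
    for key in currencies.keys():
        currencies[key] = currencies[key] / usdRate
    return currencies   # {'USD': 1.0, 'EUR': 0.789...}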
def save(self):
    # Extract the HTML of the first image in the article body.
    soup = BeautifulSoup(self.content_html)
    # soup.first("img") returns only the first picture; it must be converted
    # to str, otherwise it is a Tag object.
    pic = soup.first("img")
    self.content_pic = str(pic) if pic else ''
    super(News, self).save()
def scrapeList(self):
    """
    Scrapes the pages for a list.
    Saves the list index page locally.
    Sends for scraping of each month's pages (and then on to the individual messages).
    """
    # Get the page that lists the months of archive pages.
    source = self.fetchPage(self.list_url)
    # The copy of the page we save is filtered for email addresses, links, etc.
    filtered_source = self.filterPage(source)
    # Save our local copy.
    # eg /Users/phil/Sites/examplesite/html/list-name/index.html
    local_index = open(self.publish_dir + "/index.html", "w")
    local_index.write(filtered_source)
    local_index.close()
    soup = BeautifulSoup(source)
    # Go through each row in the table except the first (which is column headers).
    for row in soup.first("table")("tr")[1:]:
        # Get the text in the first column: "February 2009:"
        (month, year) = row("td")[0].string.split()
        # Strip the colon off.
        year = year[:-1]
        # Scrape the date page for this month and get all its messages.
        # keep_fetching will be True or False, depending on whether we need
        # to keep getting older months.
        keep_fetching = self.scrapeMonth(year + "-" + month)
        if not keep_fetching:
            break
def _menu_item_exists(self, menu_item_text):
    response = self.app.get(get_route("home"))
    soup = BeautifulSoup(response.body)
    menu = soup.first("ul", {"class": "nav"})
    return bool(menu.first("a", text=menu_item_text))
def parseDream2(htmlTxt):
    soup = BeautifulSoup()
    # TODO: this is temporary:
    htmlTxt = htmlTxt.replace(
        "/*<![CDATA[*/ @import \"/knowledge/stylesheets/monobook/main.css\"; /*]]>*/", "")
    soup.feed(htmlTxt)
    tableMain = soup.fetch("table", {"width": "768",
                                     "align": "center",
                                     "cellspacing": "0",
                                     "cellpadding": "0"})
    if not tableMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    td = None
    for table in tableMain:
        tr = table.first("tr")
        if tr:
            tdTest = tr.first("td", {"width": "100%", "valign": "top"})
            if tdTest:
                td = tdTest
    if not td:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    # reparsing the cell should not be necessary, but parsing fails without it
    soup2 = BeautifulSoup()
    soup2.feed(str(td).replace("<br />>", ""))
    td = soup2.first("td")
    # no results?
    if td.first("center"):
        return (NO_RESULTS, dNoResultsText)
    # results
    bTable = td.fetch("b")
    if not bTable:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    outerList = []
    for bItem in bTable:
        title = getAllTextFromTag(bItem)
        next = getLastElementFromTag(bItem)
        pItem = None
        while next and not pItem:
            if isinstance(next, Tag):
                if next.name == "p":
                    pItem = next
            next = next.next
        if pItem:
            text = getAllTextFromTagWithA(pItem.first("font"))
            if text.startswith("Interpretation: "):
                text = text[len("Interpretation: "):]
            outerList.append((title, text))
    if 0 == len(outerList):
        return (NO_RESULTS, dNoResultsText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
def reversePhoneLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    tdWithResults = soup.first("td", {"class": "TTLPREF"})
    if not tdWithResults:
        tdWithResults = soup.first("span", {"class": "TTLPREF"})
    if not tdWithResults:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # results are inside <td>
    fontColor = tdWithResults.first("font")
    if fontColor:
        # "No details available."
        counter = 0
        for br in tdWithResults.fetch("br"):
            # we believe the city comes after the 6th <br>
            if counter == 5:
                city = "%s" % str(br.next).replace("\n", "").strip()
                returned.append(["", "", city, ""])
            counter += 1
    else:
        # all data, or city & phone
        counter = 0
        person = ""
        address = ""
        city = ""
        phone = ""
        for br in tdWithResults.fetch("br"):
            # 7 <br> in <td>
            if 1 == counter:
                if not isinstance(br.next, Tag):
                    person = "%s" % str(br.next).replace("\n", "").strip()
            if 2 == counter:
                if not isinstance(br.next, Tag):
                    address = "%s" % str(br.next).replace("\n", "").strip()
            if 3 == counter:
                city = "%s" % str(br.next).replace("\n", "").strip()
            if 4 == counter:
                phone = "%s" % str(br.next).replace("\n", "").strip()
            counter += 1
        returned.append((person, address, city, phone))
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def create_html_digest_for_label(email_id, threads, label, soup):
    threadtable = soup.find('table', {'id': label + 'table'})
    threadentrytemplate = open("html_templates/thread.html").read()
    count = 0
    thread_ids = []
    for thread in threads:
        # print 'thread ' + label, count
        threadsoup = BeautifulSoup(threadentrytemplate)
        threadtag = threadsoup.first()
        subject = thread['subject']
        outline = thread['snippet']
        thread_ids.append(thread['id'])
        sender = get_sender_string(email_id, thread['participants'])
        sendertag = threadtag.find('span', {'id': 'thsender'})
        if sendertag is not None:
            sendertag.contents[0].replaceWith(sender)
        subjecttag = threadtag.find('span', {'id': 'thsubject'})
        if subjecttag is not None:
            subjecttag.contents[0].replaceWith(subject)
        outlinetag = threadtag.find('div', {'id': 'thoutline'})
        if outlinetag is not None:
            outlinetag.contents[0].replaceWith(outline)
        inboxonce = threadtag.find('a', {'class': 'inboxonce'})
        if inboxonce is not None:
            inboxonce['href'] = (prioritizer_url + '/daily_digest/inbox_once?email='
                                 + email_id + "&id=" + thread['id'])
        inboxalways = threadtag.find('a', {'class': 'inboxalways'})
        if inboxalways is not None:
            inboxalways['href'] = (prioritizer_url + '/daily_digest/inbox_always?email='
                                   + email_id + "&id=" + thread['id'])
        unsubscribe = threadtag.find('a', {'class': 'unsubscribe'})
        if unsubscribe is not None:
            unsubscribe['href'] = (prioritizer_url + '/daily_digest/unsubscribe?email='
                                   + email_id + "&id=" + thread['id'])
        threadtable.append(threadtag)
        count += 1
    if count > 0:
        labelcounttag = soup.find('span', {'id': label + 'number'})
        if labelcounttag is not None:
            labelcounttag.contents[0].replaceWith(" (" + str(count) + ")")
    return soup, count, thread_ids
def parse_job_post(self, html):
    features = dict()
    soup = BeautifulSoup(html)
    form = soup.first('form', {'name': 'applyjob'})
    features[COMPANY_NAME] = form.first('input', {'name': 'company_name'})
    features[POSITION_TITLE] = form.first('input', {'name': 'position'})
    features[POST_DATE] = form.first('input', {'name': 'insert_date'})
    print html
    return (None, features)
def children(self):
    # this is nasty, but the children are not encoded in the OBO
    if len(self._children) == 0:
        file = utils.download('%smini' % (url_go_lookup[:-3] % self.id))
        soup = BeautifulSoup(file)
        tab = soup.first('table')
        for entry in tab.contents:
            if isinstance(entry, Tag):
                self._children.append(entry.findAll('a')[1].contents[0])
    return self._children
def fetchPastie(self):
    downloaded_page, headers = downloadUrl(self.url)
    if downloaded_page:
        htmlDom = BeautifulSoup(downloaded_page)
        # search for <textarea class="raw">
        textarea = htmlDom.first('textarea', {'class': 'raw'})
        if textarea:
            # decode HTML entities such as &gt;
            decoded = BeautifulSoup(textarea.contents[0],
                                    convertEntities=BeautifulSoup.HTML_ENTITIES)
            self.pastie_content = decoded.contents[0]
    return self.pastie_content
def getURLData(url):
    try:
        doc = urllib2.urlopen(url, timeout=5).read()
    except urllib2.URLError:
        return None
    try:
        doc = BeautifulSoup(doc)
    except UnicodeEncodeError:
        # This is for links to files/images.
        doc = BeautifulSoup('')
    title = url
    description = u''
    # title
    if doc.title:
        title = doc.title.string
    if not title:
        title = doc.first('meta', attrs={'name': 'title'})
        if title:
            title = title.get('content')
    # description
    description = doc.first('meta', attrs={'name': 'description'})
    if description:
        description = description.get('content')
    # Find favicon
    host_url = urlparse(url)
    favicon_url = doc.first('link', rel='shortcut icon')
    if favicon_url:
        favicon_url = favicon_url.get('href')
        if not favicon_url.startswith('http'):
            favicon_url = host_url[0] + u'://' + host_url[1] + favicon_url
    else:
        favicon_url = host_url[0] + u'://' + host_url[1] + u'/favicon.ico'
    return json.dumps({'title': title,
                       'description': description,
                       'favicon_url': favicon_url})
def unwrap_minecraftforum(url, resp, body):
    urls = {
        'forum': url,  # This *is* the forum page!
    }
    # It might also have clues as to where the downloads are.
    soup = BeautifulSoup(body)
    post_elt = soup.first('div', 'entry-content')
    best_href = None
    best_score = 0
    # This class distinguishes URLs inserted by the author of the post.
    for a_elt in post_elt.findAll('a', 'bbc_url'):
        try:
            href = a_elt['href']
            # Many entries contain self-links.
            if href == url:
                continue
            # Check for licence link.
            licence_score = sum(pat_score for (pat, pat_score) in LICENCE_SCORES
                                if pat.search(href))
            if licence_score:
                urls['licence'] = href
                continue
            # Otherwise, this is a candidate for download or home link.
            score = sum(pat_score for (pat, pat_score) in URL_SCORES if pat.search(href))
            labels = []
            try:
                label = ''.join(a_elt.findAll(text=True))
                if label:
                    labels.append(label)
            except AttributeError:
                pass
            # Look for label immediately preceding link:
            label = a_elt.findPreviousSibling(text=True)
            if label:
                labels.append(label)
            for label in labels:
                score += sum(pat_score for (pat, pat_score) in LABEL_SCORES
                             if pat.search(label))
            if a_elt.img:
                score += 10
            if score > best_score:
                best_href, best_score = href, score
        except KeyError, e:
            print >> sys.stderr, a_elt, 'did not have', e
    # Presumed completion: record the best-scoring candidate as the download link.
    if best_href:
        urls['download'] = best_href
    return urls
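# Hedged sketch (an assumption, not from the original source): LICENCE_SCORES,
# URL_SCORES and LABEL_SCORES are consumed above as sequences of
# (compiled_regex, weight) pairs, so they might look something like this:
import re
URL_SCORES = [
    (re.compile(r'mediafire\.com|dropbox\.com'), 5),   # typical file hosts
    (re.compile(r'adf\.ly'), 2),                       # shorteners often wrap downloads
]
LABEL_SCORES = [
    (re.compile(r'download', re.I), 5),
    (re.compile(r'homepage|website', re.I), -2),
]
LICENCE_SCORES = [
    (re.compile(r'licen[cs]e', re.I), 5),
]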
def scrapeList(self):
    source = self.fetchPage(self.list_url)
    filtered_source = self.filterPage(source)
    soup = BeautifulSoup(source)
    for row in soup.first('table')('tr')[1:]:
        rel_url = row('td')[2]('a')[0].get('href')
        source = self.fetchPage(self.list_url + '/' + rel_url)
        local_month = open(self.local_dir + '/' + rel_url, 'w')
        local_month.write(source)
        local_month.close()
def getURLData(url):
    try:
        doc = urllib2.urlopen(url, timeout=5).read()
    except urllib2.URLError:
        return None
    try:
        doc = BeautifulSoup(doc)
    except UnicodeEncodeError:
        # This is for links to files/images.
        doc = BeautifulSoup('')
    title = url
    description = u''
    # title
    if doc.title:
        title = doc.title.string
    if not title:
        title = doc.first('meta', attrs={'name': 'title'})
        if title:
            title = title.get('content')
    # description
    description = doc.first('meta', attrs={'name': 'description'})
    if description:
        description = description.get('content')
    # Find favicon
    host_url = urlparse(url)
    favicon_url = doc.first('link', rel='shortcut icon')
    if favicon_url:
        favicon_url = favicon_url.get('href')
        if not favicon_url.startswith('http'):
            favicon_url = host_url[0] + u'://' + host_url[1] + favicon_url
    else:
        favicon_url = host_url[0] + u'://' + host_url[1] + u'/favicon.ico'
    return {'title': title, 'description': description, 'favicon_url': favicon_url}
def fetch_pastie(self):
    downloaded_page, headers = download_url(self.url)
    if downloaded_page:
        htmlDom = BeautifulSoup(downloaded_page)
        # search for <textarea class="raw">
        textarea = htmlDom.first('textarea', {'class': 'raw'})
        if textarea:
            # decode HTML entities such as &gt;
            decoded = BeautifulSoup(textarea.contents[0],
                                    convertEntities=BeautifulSoup.HTML_ENTITIES)
            self.pastie_content = decoded.contents[0]
    return self.pastie_content
def download_script(script_id, save_to):
    vimhome = "http://www.vim.org/scripts/"
    data = urllib2.urlopen(vimhome + "/script.php?script_id=" + script_id)
    soup = BeautifulSoup(data)
    # the first row of the download-link table
    a_tag = soup.first('td', {'class': 'rowodd'}).find('a')
    download_link = a_tag["href"]
    download_filename = a_tag.text
    src_data = urllib2.urlopen(vimhome + download_link).read()
    dst_path = os.path.join(save_to, download_filename)
    dst_fileobj = open(dst_path, "wb")
    dst_fileobj.write(src_data)
    dst_fileobj.close()
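# Hedged usage note (not from the original source): script_id must be a string,
# since download_script concatenates it into the query URL. The id and target
# directory below are hypothetical; uncomment to fetch a script for real.
# download_script("2332", os.path.expanduser("~/.vim/bundle"))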
def parseStock(htmlTxt):
    # this is funny
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = testNoResults(soup)
    if NO_RESULTS == noResults:
        return (NO_RESULTS, sNoResultsText)
    # get name
    nameTag = soup.first("td", {"height": "30", "class": "ygtb"})
    if not nameTag:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    name = getAllTextFromTag(nameTag).strip()
    # get all data from table
    bigTable = soup.fetch("table", {"width": "580", "id": "yfncsumtab"})
    if 1 != len(bigTable):
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    tdDataList = bigTable[0].fetch("td", {"class": "yfnc_tabledata1"})
    innerList = [name]
    counter = 0
    for tdItem in tdDataList:
        if 2 == counter:
            # the 3rd element carries the up/down icon
            imgItem = tdDataList[2].first("img")
            upDown = ""
            if imgItem:
                upDown = imgItem['alt']
            innerList.append(upDown)
            bItem = tdDataList[2].first("b")
            itemText = ""
            if bItem:
                itemText = getAllTextFromTag(bItem).strip()
            innerList.append(itemText)
        else:
            itemText = getAllTextFromTag(tdItem).strip()
            innerList.append(itemText)
        counter += 1
    # any results?
    if 0 == counter:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    # one-item UDF
    outerList = [innerList]
    return (STOCKS_DATA, universalDataFormatReplaceEntities(outerList))
class AtomRecentActivity(RecentActivitySource):
    def __init__(self, lastDay, feedUrl):
        RecentActivitySource.__init__(self, lastDay)
        self.feedUrl = feedUrl

    def collectData(self):
        opener = urllib2.build_opener()
        self.feedXml = BeautifulSoup(opener.open(self.feedUrl))

    def interpretData(self):
        feed = self.feedXml.first()
        for entry in feed.findAll('entry'):
            self.logEntryAsActivity(entry)

    def logEntryAsActivity(self, entry):
        self.recentActivity[entry.updated.text] += 1