def parseCurrencyData(htmlText):
    global _g_regionsToISO
    soup = BeautifulSoup()
    soup.feed(htmlText)
    tables = soup("table", {"border": "0", "width": "368", "cellpadding": "0", "cellspacing": "0"})
    rows = tables[0].fetch("tr")
    currencies = dict()
    isFirstRow = True
    for row in rows:
        cells = row.fetch("td")
        if 4 != len(cells):
            continue
        if isFirstRow:
            isFirstRow = False
            continue
        region = retrieveContents(cells[1].fetch("div")[0].contents[0])
        try:
            rate = float(retrieveContents(cells[3].fetch("div")[0].contents[0]))
            if "*" == retrieveContents(cells[0].contents[0]):
                rate = 1 / rate
            abbrev = _g_regionsToISO[region]
            currencies[abbrev] = rate
        except ValueError:
            pass
    return (RESULTS_DATA, currencies)
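# A hedged usage sketch, not part of the original module: every parser in this
# file returns a (status, payload) pair, so a caller dispatches on the status
# constant.  getHttp and ratesUrl are assumed/hypothetical names here.
def _exampleCurrencyDriver():
    status, currencies = parseCurrencyData(getHttp(ratesUrl))
    if RESULTS_DATA == status:
        for abbrev in sorted(currencies.keys()):
            print "%s: %.4f" % (abbrev, currencies[abbrev])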
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # Country code(s)
    tableList = soup.fetch("table", {"summary": "Codes Results"})
    if len(tableList) != 1:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # found country code
    tdListA = tableList[0].fetch("td", {"style": "%padding-left:5px;"})
    tdListB = tableList[0].fetch("td", {"style": "%line-height:14pt;"})
    if len(tdListA) != len(tdListB):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    cityCodeList = []
    for i in range(len(tdListA)):
        if 0 == i:
            result.append([getAllTextFromTag(tdListB[i])])
        else:
            city = getAllTextFromTag(tdListA[i])
            code = getAllTextFromTag(tdListB[i])
            cityCodeList.append((city, code))
    # sort the (city, code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
def parseCurrency(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # TABLE WIDTH="100%" BORDER="0" CELLPADDING="0" CELLSPACING="0" BGCOLOR="#009900"
    # <TABLE WIDTH="100%" BORDER="0" CELLPADDING="1" CELLSPACING="1" BGCOLOR="#000000">
    findTable = soup.fetch("table", {"width": "100%", "border": "0", "cellpadding": "1", "cellspacing": "1", "bgcolor": "#000000"})
    #print findTable
    if not findTable:
        return (UNKNOWN_FORMAT, currencyNoResultsText)
    itemTable = findTable[0]
    findTableTR = itemTable.fetch("tr")
    # parse page and create dictionary
    for itemTR in findTableTR:
        findTD = itemTR.fetch("td")
        if 0 == len(findTD):
            continue
        if 4 != len(findTD):
            return (UNKNOWN_FORMAT, currencyNoResultsText)
        #print str(findTD[1].contents[0].contents[0].contents[0])
        #print str(findTD[2].contents[0].contents[0]).replace(",","").strip()
        abbrev = str(findTD[1].contents[0].contents[0].contents[0])
        g_AbbrevToRatesDict[abbrev] = float(str(findTD[2].contents[0].contents[0]).replace(",", "").strip())
    g_AbbrevToRatesDict["USD"] = 1.0
    return (RESULTS_DATA, g_AbbrevToRatesDict)
def getTextFromDirtyText(dirtyText):
    soup = BeautifulSoup()
    soup.feed("<xxx>" + dirtyText + "</xxx><yyy>test</yyy>")
    dirtySoup = soup.first("xxx")
    textWithBr = getAllTextFromToInBrFormat(dirtySoup, getLastElementFromTag(dirtySoup).next)
    text = textWithBr.replace("<br>", "\n").replace("<b>", "").replace("</b>", "")
    return text
def parseFirstDayHtml(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, humidity, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]
    bItems = soup.fetch("b", {"class": "obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("°F", "").strip()
    bItem = soup.first("b", {"class": "obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("°F", "").strip()
    tdList = soup.fetch("td", {"class": "obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert (len(tdList) == 4)
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%", "").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace("in.", "inches").strip()  ##todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("°F", "").strip()
    for r in returned:
        if r == None or r == "":
            return None
    return returned
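# Hedged caller sketch (not from the original module): parseFirstDayHtml
# returns None on a page it cannot parse, otherwise an 8-element list in the
# order given by the comment above; the local names below mirror that comment.
def _exampleFirstDayUsage(htmlTxt):
    parsed = parseFirstDayHtml(htmlTxt)
    if parsed is None:
        return None
    sky, temp, feelsLike, wind, hum, pressure, dewPoint, visibility = parsed
    return "%s, %sF (feels like %sF), wind %s" % (sky, temp, feelsLike, wind)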
def reverseZIPCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # results (usually one, but we handle more than one)
    tables = soup.fetch("table", {"summary": "Codes Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            timezone = getAllTextFromTag(tdList[2])
            if city != "New Search":
                smallList = (city, country, timezone)
                returned.append(smallList)
        elif len(tdList) == 2:
            # special case (911)
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            if city != "New Search":
                smallList = (city, country, "")
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def parseFirstDayHtmlYahoo(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, humidity, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, None]
    fontList = soup.fetch("font", {"face": "Arial", "size": "2"})
    textList = []
    wasFeelsLike = False
    for f in fontList:
        text = getAllTextFromTag(f).strip()
        if wasFeelsLike:
            textList.append(text)
        else:
            if text == "Feels Like:":
                textList.append(text)
                wasFeelsLike = True
    if len(textList) >= 16:
        smallList = textList[1::2]
        returned[0] = ""
        returned[1] = smallList[0].replace("°", "")
        returned[2] = smallList[0].replace("°", "")
        returned[3] = smallList[3]
        returned[4] = smallList[4].replace("%", "")
        returned[5] = smallList[2]
        returned[6] = smallList[1].replace("°", "")
        returned[7] = smallList[6]
    for r in returned:
        if r == None:
            return None
    return returned
def parseGasOld(htmlTxt, url=None, dbgLevel=0):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    testTitle = soup.first("title")
    if testTitle:
        if getAllTextFromTag(testTitle).startswith("GasBuddy.com - Find cheap gas prices in your city"):
            return (LOCATION_UNKNOWN, gLocationUnknownText)
    outerList = []
    trList = soup.fetch("tr")
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 8 == len(tdList):
            if tdList[1].first("table"):
                price = getAllTextFromTag(tdList[0]).strip()
                name = getAllTextFromTag(tdList[2]).strip()
                address = getAllTextFromTag(tdList[4]).strip()
                area = getAllTextFromTag(tdList[5]).strip()
                time = getAllTextFromTag(tdList[6]).strip()
                smallList = [price, name, address, area, time]
                outerList.append(smallList)
        else:
            if 0 != len(tdList):
                firstB = tdList[0].first("b")
                if firstB:
                    if getAllTextFromTag(firstB).startswith("No gas prices found."):
                        return (NO_RESULTS, gNoResultsText)
    if 0 == len(outerList):
        if dbgLevel > 0:
            print "len(outerList)==0"
        return parsingFailed(url, htmlTxt)
    return (GAS_DATA, universalDataFormatReplaceEntities(outerList))
def parseName(htmlTxt):
    # normalize malformed HTML comments ("<! -- ") so the parser handles them
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    fontList = soup.fetch("font", {"face": "arial"})
    for fontItem in fontList:
        iItem = fontItem.first("i")
        if iItem:
            if str(iItem.contents[0]).startswith("Your search for"):
                return (NO_RESULTS, sNoResultsText)
    # get table data
    trList = soup.fetch("tr", {"bgcolor": "#ffffff"})
    resultsCount = 0
    outerList = []
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 5 == len(tdList):
            symbol = getAllTextFromTag(tdList[0]).strip()
            url = tdList[0].first("a")['href']
            name = getAllTextFromTag(tdList[1]).strip()
            market = getAllTextFromTag(tdList[2]).strip()
            industry = getAllTextFromTag(tdList[3]).strip()
            outerList.append((url, symbol, name, market, industry))
            resultsCount += 1
    # no results?
    if 0 == resultsCount:
        return (NO_RESULTS, sNoResultsText)
    return (STOCKS_LIST, universalDataFormatReplaceEntities(outerList))
def tryParseDetails(htmlTxt, updateString):
    htmlTxt = removeCloseTagAttr(htmlTxt)
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    flightInfo = []
    table = soup.first("table", {"name": "flight_info"})
    trList = []
    if table:
        for tr in table.fetch("tr"):
            if len(tr.fetch("td")) == 4:
                trList.append(tr)
            elif len(tr.fetch("td")) == 1:
                img = tr.first("img", {"alt": "Continuing on To"})
                if img:
                    trList.append(tr)
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 4:
            # "&nbsp;" assumed here: the source showed replace(" ", " "),
            # a no-op left behind when the entity was decoded during extraction
            info = getAllTextFromTag(tdList[0]).replace("&nbsp;", " ").strip()
            infoFrom = getAllTextFromTag(tdList[1]).replace("&nbsp;", " ").strip()
            infoTo = getAllTextFromTag(tdList[3]).replace("&nbsp;", " ").strip()
            if info != "":
                flightInfo.append([info, infoFrom, infoTo])
        else:
            flightInfo.append([""])
    flight = ""
    table = soup.first("table", {"name": "headbar2"})
    if table:
        bItem = table.first("b")
        if bItem:
            flight = getAllTextFromTag(bItem)
    if 0 == len(flightInfo) or "" == flight:
        return UNKNOWN_FORMAT, None
    # definition
    df = Definition()
    df.TextElement(flight, style=styleNameBold)
    df.LineBreakElement(1, 2)
    index = 0
    for item in flightInfo:
        # info, from, to
        if len(item) == 3:
            df.TextElement(item[0], style=styleNameHeader)
            if item[1] != "":
                df.LineBreakElement()
                df.TextElement(item[1])
            if item[2] != "":
                gtxt = df.TextElement(item[2])
                gtxt.setJustification(justRight)
            else:
                df.LineBreakElement()
        else:
            df.HorizontalLineElement()
    return RESULTS_DATA, universalDataFormatWithDefinition(df, [["U", updateString]])
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException=False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # assumed handler (the original was truncated): log and skip this book
        log(SEV_EXC, exceptionAsStr(ex))
        return None
def parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # find table with results
    tableList = soup.fetch("table", {"cellpadding": "0", "cellspacing": "0", "border": "0", "width": "100%"})
    if 0 == len(tableList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    outerList = []
    for table in tableList:
        trList = table.fetch("tr")
        if 2 <= len(trList):
            tdCount = len(trList[0].fetch("td"))
            if 3 > tdCount:
                return (UNKNOWN_FORMAT, jUnknownFormatText)
            for tr in trList[1:]:
                tdList = tr.fetch("td")
                rank = ""
                if 4 == tdCount:
                    rank = getAllTextFromTag(tdList[0])
                title = getAllTextFromTag(tdList[-3])
                rating = getAllTextFromTag(tdList[-2])
                explicitness = getAllTextFromTag(tdList[-1])
                url = tdList[-3].first("a")["href"]
                if not url:
                    return (UNKNOWN_FORMAT, jUnknownFormatText)
                outerList.append((rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
def parseCurrencyData(htmlText):
    global _g_imgRe
    soup = BeautifulSoup()
    soup.feed(htmlText)
    # <table width="60%" border="0" cellpadding="3" summary="Displays latest tourist currency rates">
    table = soup.first("table", {"border": "0", "width": "60%", "cellpadding": "3", "summary": "Displays latest tourist currency rates"})
    assert table is not None
    tbody = table.first("tbody")
    assert tbody is not None
    rows = tbody.fetch("tr")
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        img = cells[0].fetch("img")[0]
        match = _g_imgRe.match(img["src"])
        if match is None:
            continue
        abbrev = match.group(1)
        rate = float(str(cells[2].contents[0]).strip().split()[0])
        currencies[abbrev] = rate
    # normalize so that every rate is expressed relative to USD
    usdRate = currencies["USD"]
    for key in currencies.iterkeys():
        currencies[key] = currencies[key] / usdRate
    assert 1 == currencies["USD"]
    return (RESULTS_DATA, currencies)
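# Worked example (a sketch under an assumption, not original code): after the
# normalization above every rate is relative to USD, so converting an amount
# is a ratio of rates.  Assumes rates are units of currency per USD.
def _exampleConvert(currencies, amount, fromAbbrev, toAbbrev):
    return amount * currencies[toAbbrev] / currencies[fromAbbrev]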
def reverseAreaCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id": "listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    if len(trList) == 0:
        return UNKNOWN_FORMAT, None
    # ignore headers ([1:])
    for trItem in trList[1:]:
        if 0 == len(trItem.fetch("tr")):
            # the site's markup wrongly nests rows (<tr><tr> ... </tr></tr>),
            # so only process rows with no nested <tr>
            tdList = trItem.fetch("td", {"id": "subtextid"})
            if 3 == len(tdList):
                city = getAllTextFromTag(tdList[0])
                country = getAllTextFromTag(tdList[1])
                timezone = getAllTextFromTag(tdList[2])
                smallList = (city, country, timezone)
                returned.append(smallList)
            elif 2 == len(tdList):
                city = getAllTextFromTag(tdList[0])
                country = ""
                timezone = getAllTextFromTag(tdList[1])
                smallList = (city, country, timezone)
                returned.append(smallList)
    if 0 == len(returned):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def reversePhoneLookupWhitepages(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    div = soup.first("div", {"class": "listings"})
    if div:
        for table in div.fetch("table"):
            for tr in table.fetch("tr"):
                text1 = tr.first("div", {"class": "textb"})
                text2 = tr.first("div", {"class": "text"})
                if text1 and text2:
                    name = getAllTextFromTag(text1)
                    cont = getAllTextFromToInBrFormat(text2, getLastElementFromTag(text2).next)
                    parts = cont.split("<br>")
                    (address, city, phone) = ("", "", "")
                    if len(parts) == 3:
                        (address, city, phone) = parts
                    if len(parts) == 2:
                        (city, phone) = parts
                    if len(parts) == 4:
                        (prefix, address, city, phone) = parts
                    returned.append((name, address.strip(), city.strip(), phone.strip()))
    if len(returned) == 0:
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def mailsubject(self):
    p = Parser()
    p.feed(self._imp)
    p.goahead(0)
    div = p.find('div', attrs={'class': 'moji'})
    # FIXME: assuming BeautifulSoup uses utf8
    return str(div.contents[0]).decode('utf8')
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id": "listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    cityCodeList = []
    for trItem in trList:
        if 0 == len(trItem.fetch("tr")):
            tdList = trItem.fetch("td", {"id": "subtextid"})
            if 2 == len(tdList):
                if 0 == len(result):
                    result.append([getAllTextFromTag(tdList[1])])
                else:
                    city = getAllTextFromTag(tdList[0])
                    code = getAllTextFromTag(tdList[1])
                    cityCodeList.append((city, code))
    # sort the (city, code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    if 0 == len(result):
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
def parseMultiselect(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    aList = soup.fetch("a", {"href": "/weather/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/businesstraveler/local/%"})
    lastCode = ""
    resultsCount = 0
    for aItem in aList:
        afterLocal = aItem['href'].split("local/")
        if 2 == len(afterLocal):
            textAfterLocal = afterLocal[1]
            if 8 < len(textAfterLocal):
                code = textAfterLocal[:8]
                textAfterLocal = textAfterLocal[8:]
                if textAfterLocal.startswith("?from=search_"):
                    # lastCode accumulates codes we have already seen
                    if -1 == lastCode.find(code):
                        lastCode += code
                        text = getAllTextFromTag(aItem)
                        resultsCount += 1
                        returned.append((text, code))
    if 0 == resultsCount:
        return (LOCATION_UNKNOWN, None)
    return (LOCATION_MULTISELECT, universalDataFormatReplaceEntities(returned))
def mailbody(self):
    p = Parser()
    p.feed(self._imp)
    p.goahead(0)
    td = p.find('td', attrs={'class': 'moji'})
    # FIXME: assuming BeautifulSoup uses utf8
    u = str(td).decode('utf8')
    return tagStrip.sub(replproc, u)
def parseDream2(htmlTxt):
    soup = BeautifulSoup()
    # TODO: this is temporary:
    htmlTxt = htmlTxt.replace("/*<![CDATA[*/ @import \"/knowledge/stylesheets/monobook/main.css\"; /*]]>*/", "")
    soup.feed(htmlTxt)
    tableMain = soup.fetch("table", {"width": "768", "align": "center", "cellspacing": "0", "cellpadding": "0"})
    if not tableMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    td = None
    for table in tableMain:
        tr = table.first("tr")
        if tr:
            tdTest = tr.first("td", {"width": "100%", "valign": "top"})
            if tdTest:
                td = tdTest
    if not td:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    # reparse the cell; without this the searches below do not work
    soup2 = BeautifulSoup()
    soup2.feed(str(td).replace("<br />>", ""))
    td = soup2.first("td")
    # no results?
    if td.first("center"):
        return (NO_RESULTS, dNoResultsText)
    # results
    bTable = td.fetch("b")
    if not bTable:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    outerList = []
    for bItem in bTable:
        title = getAllTextFromTag(bItem)
        next = getLastElementFromTag(bItem)
        pItem = None
        while next and not pItem:
            if isinstance(next, Tag):
                if next.name == "p":
                    pItem = next
            next = next.next
        if pItem:
            text = getAllTextFromTagWithA(pItem.first("font"))
            if text.startswith("Interpretation: "):
                text = text[len("Interpretation: "):]
            outerList.append((title, text))
    if 0 == len(outerList):
        return (NO_RESULTS, dNoResultsText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
def personSearch(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    ttlPref = soup.first("td", {"class": "TTLPREF"})
    if not ttlPref:
        ttlPref = soup.first("span", {"class": "TTLPREF"})
    if not ttlPref:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # too many results:
    font = ttlPref.first("font", {"color": "#FF0000"})
    if font:
        if "No results." == font.contents[0]:
            return (NO_RESULTS, m411NoResultsText)
        if "Results found in multiple cities." == font.contents[0]:
            brList = ttlPref.fetch("br")
            brList = brList[4:]  ## skip text about select
            for br in brList:
                text = str(br.next).replace("<br />", "").replace("\n", "").strip()
                if len(text) > 0:
                    returned.append(text)
            return (MULTIPLE_SELECT, string.join(returned, "\n"))
        return (TOO_MANY_RESULTS, m411TooManyResults)
    # results:
    brList = ttlPref.fetch("br")
    resultsCount = len(brList) - 2
    if 0 == resultsCount:
        # no city?
        if "NO CITY FOUND" == str(brList[1].next).replace("\n", "").strip():
            return (NO_CITY, m411NoCity)
    results = resultsCount / 5
    if results * 5 != resultsCount:  ## test if number of <br> is 5*n+2
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # get them
    brList = brList[1:]  ## skip first br
    counter = 0
    smallList = []
    for br in brList:
        text = str(br.next).replace("<br />", "").replace("\n", "").strip()
        if results > 0:
            if 0 == counter:
                smallList = [text]
            if 1 == counter or 2 == counter:
                smallList.append(text)
            if 3 == counter:
                smallList.append(text)
                returned.append(smallList)
                results -= 1
        counter += 1
        if 5 == counter:
            counter = 0
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
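# Hedged dispatch sketch (hypothetical caller, not original code): personSearch
# can produce several statuses; MULTIPLE_SELECT carries a newline-separated
# city list that a caller would present to the user before re-querying.
def _examplePersonSearchDispatch(htmlTxt):
    status, payload = personSearch(htmlTxt)
    if RESULTS_DATA == status:
        return payload                  # UDF-formatted person records
    if MULTIPLE_SELECT == status:
        return payload.split("\n")      # cities to choose from
    return payload                      # user-visible message text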
def _parseRandomJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    table = soup.first("table", {"id": "jokeIframeTable2"})
    if not table:
        return UNKNOWN_FORMAT, None
    # title
    titleSpan = table.first("span", {"class": "jokeTitle_v2"})
    if not titleSpan:
        return UNKNOWN_FORMAT, None
    title = getAllTextFromTag(titleSpan)
    # text
    trList = table.fetch("tr")
    text = ""
    if len(trList) > 6:
        tdList = trList[5].fetch("td")
        if len(tdList) == 3:
            text = getAllTextFromToInBrFormat(tdList[1], tdList[2])
            # "&nbsp;" assumed: the source showed the no-op replace(" ", " "),
            # most likely a decoded entity lost in extraction
            if len(text.replace("&nbsp;", " ").strip()) < 2:
                text = ""
    if "" == text:
        return UNKNOWN_FORMAT, None
    smallList = [title, text]
    # rating
    table = soup.first("table", {"id": "Table5"})
    if table:
        td = table.first("td")
        if td:
            imgList = td.fetch("img", {"src": "%"})
            rating = "not rated"
            # icon file name -> numeric rating
            translator = {
                "iconrate_one": "1",
                "iconrate_two": "2",
                "iconrate_three": "3",
                "iconrate_four": "4",
                "iconrate_five": "5",
                "iconrate_one_half": "1.5",
                "iconrate_two_half": "2.5",
                "iconrate_three_half": "3.5",
                "iconrate_four_half": "4.5",
                "iconrate_zero_half": "0.5",
            }
            for img in imgList:
                src = img['src']
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                try:
                    rating = translator[src]
                except KeyError:
                    pass
            smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def parse(wordtosearch):
    url = 'http://dictionary.reference.com/search?q=' + wordtosearch
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    # Read the main table, extracting the words from the table cells.
    maintable = soup.fetch('li')
    # There are 6 lines containing <li> at the bottom that we don't want to
    # print, so we remove them from the list by adjusting the count.
    removeli = len(maintable) - 6
    counter = 0
    # if removeli is 0 then we need to look for dl tags
    if removeli == 0:
        # fetch dl tags
        maintable = soup.fetch('dl')
        for defs in maintable:
            converttostring = str(defs)
            splitstring = converttostring.split('<dd>')
            removetrash = re.sub('^ |</dd.*dl>|<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>', '', splitstring[1])
            addunderscores = re.sub('<u><i>|</i></u>', '_', removetrash)
            # '&amp;' assumed: the source showed re.sub('&', '&', ...), a no-op
            # left behind when the entity was decoded during extraction
            convertampersands = re.sub('&amp;', '&', addunderscores)
            definition = convertampersands
            print definition
    else:
        for counter in range(removeli):
            defs = maintable[counter]
            converttostring = str(defs)
            splitstring = converttostring.split('<li>')
            if len(splitstring) != 1:
                removetrash = re.sub('^ |(<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>)', '', splitstring[1])
                addunderscores = re.sub('(<u><i>|</i></u>)', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
            else:
                removetrash = re.sub('^ |<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>', '', splitstring[0])
                addunderscores = re.sub('<u><i>|</u></i>', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
            counter += 1
def _parse_letter_page(self, letter, html, index):
    self._check_finish()
    soup = BeautifulSoup()
    soup.feed(html)
    div = soup.first("div", {"class": "sidebar-module"})
    assert div is not None
    count = int(retrieveContents(div.contents[2]).split()[2])
    offset = 0
    self._lock.acquire()
    try:
        if count <= self._data[letter][0]:
            print 'Letter "%s" is up to date (%d records).' % (letter, self._data[letter][0])
            return True, count, 0
        offset = self._offsets[letter]
    finally:
        self._lock.release()
    spidered = 0
    div = soup.first("div", {"class": "titleList"})
    assert div is not None
    anchors = div.fetch("a")
    urls = []
    for a in anchors:
        url = _g_manybooks_url + urllib.quote(a["href"])
        urls.append(url)
    for url in urls:
        self._check_finish()
        i = -1
        self._lock.acquire()
        try:
            books = self._data[letter][1]
            i = _find_book_index(books, url, index)
        finally:
            self._lock.release()
        if -1 != i:
            index = i + 1
        else:
            book = _spider_book_info(url, letter)
            if book is not None:
                spidered += 1
                self._lock.acquire()
                try:
                    self._fresh_books.append((letter, index + offset, book))
                    if len(self._fresh_books) == self.flush_after:
                        self._flush_books()
                    offset += 1
                    self._offsets[letter] = offset
                    if self._data[letter][0] + offset == count:
                        return True, count, spidered
                finally:
                    self._lock.release()
    return (index + offset == count), index, spidered
def parseCurrencyData(htmlText):
    global _g_metaRe
    htmlText = _g_metaRe.sub("", htmlText)
    soup = BeautifulSoup()
    soup.feed(htmlText)
    tables = soup("table", {"cellpadding": "2", "cellspacing": "0", "border": "0", "width": "468"})
    assert 1 == len(tables)
    rows = tables[0].fetch("tr", {"bgcolor": "#FFFFFF"})
    currencies = dict()
    for row in rows:
        fonts = row.fetch("font")
        currencies[str(fonts[2].contents[0])] = float(str(fonts[3].contents[0]))
    return (RESULTS_DATA, currencies)
def parseJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    table = soup.first("table", {"width": "328", "id": "Table2"})
    if not table:
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    tdList = table.fetch("td", {"colspan": "3", "valign": "top", "class": "body"})
    if 3 != len(tdList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    # simple format - simple parser
    title = getAllTextFromTag(tdList[0]).strip()
    text = getAllTextFromToInBrFormat(tdList[1], tdList[2].previous)
    smallList = [title, text]
    # add rating information; the random joke page sometimes returns next to
    # nothing, so sanity-check the length first
    if len(title) + len(text) > 16:
        span = soup.first("span", {"class": "body"})
        if span:
            text = getAllTextFromTag(span).replace("\n", "").strip()
            img = span.first("img", {"src": "%"})
            if text.startswith("CURRENT RATING") and img:
                src = img['src']
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                # icon file name -> numeric rating
                translator = {
                    "iconrate_one": "1",
                    "iconrate_two": "2",
                    "iconrate_three": "3",
                    "iconrate_four": "4",
                    "iconrate_five": "5",
                    "iconrate_one_half": "1.5",
                    "iconrate_two_half": "2.5",
                    "iconrate_three_half": "3.5",
                    "iconrate_four_half": "4.5",
                    "iconrate_zero_half": "0.5",
                }
                rating = "not rated"
                try:
                    rating = translator[src]
                except KeyError:
                    pass
                smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def parseStock(htmlTxt):
    # normalize malformed HTML comments ("<! -- ") so the parser handles them
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = testNoResults(soup)
    if NO_RESULTS == noResults:
        return (NO_RESULTS, sNoResultsText)
    # get name
    nameTag = soup.first("td", {"height": "30", "class": "ygtb"})
    if not nameTag:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    name = getAllTextFromTag(nameTag).strip()
    # get all data from table
    bigTable = soup.fetch("table", {"width": "580", "id": "yfncsumtab"})
    if 1 != len(bigTable):
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    tdDataList = bigTable[0].fetch("td", {"class": "yfnc_tabledata1"})
    innerList = [name]
    counter = 0
    for tdItem in tdDataList:
        if 2 == counter:
            # the 3rd element carries the up/down icon
            imgItem = tdDataList[2].first("img")
            upDown = ""
            if imgItem:
                upDown = imgItem['alt']
            innerList.append(upDown)
            bItem = tdDataList[2].first("b")
            itemText = ""
            if bItem:
                itemText = getAllTextFromTag(bItem).strip()
            innerList.append(itemText)
        else:
            itemText = getAllTextFromTag(tdItem).strip()
            innerList.append(itemText)
        counter += 1
    # any results?
    if 0 == counter:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    # one-item UDF
    outerList = [innerList]
    return (STOCKS_DATA, universalDataFormatReplaceEntities(outerList))
def parseCurrencyData(htmlText):
    global _g_xercRe
    htmlText = _g_xercRe.sub("", htmlText)
    htmlText = htmlText.replace("!BORDERCOLOR", "BORDERCOLOR")
    soup = BeautifulSoup()
    soup.feed(htmlText)
    tables = soup("table", {"class": "ictab"})
    assert 1 == len(tables)
    rows = tables[0].fetch("tr", {"valign": "top"})
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        if 4 != len(cells):
            continue
        currencies[str(cells[0].contents[0]).strip()] = float(str(cells[3].contents[0]).strip().replace(",", ""))
    return (RESULTS_DATA, currencies)
def reversePhoneLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    tdWithResults = soup.first("td", {"class": "TTLPREF"})
    if not tdWithResults:
        tdWithResults = soup.first("span", {"class": "TTLPREF"})
    if not tdWithResults:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # results are inside <td>
    fontColor = tdWithResults.first("font")
    if fontColor:
        # "No details available."
        counter = 0
        for br in tdWithResults.fetch("br"):
            # we believe the city follows the 6th <br>
            if counter == 5:
                city = "%s" % str(br.next).replace("\n", "").strip()
                returned.append(["", "", city, ""])
            counter += 1
    else:
        # all data, or city & phone
        counter = 0
        person = ""
        address = ""
        city = ""
        phone = ""
        for br in tdWithResults.fetch("br"):
            # 7 <br> in <td>
            if 1 == counter:
                if not isinstance(br.next, Tag):
                    person = "%s" % str(br.next).replace("\n", "").strip()
            if 2 == counter:
                if not isinstance(br.next, Tag):
                    address = "%s" % str(br.next).replace("\n", "").strip()
            if 3 == counter:
                city = "%s" % str(br.next).replace("\n", "").strip()
            if 4 == counter:
                phone = "%s" % str(br.next).replace("\n", "").strip()
            counter += 1
        returned.append((person, address, city, phone))
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def parseRandomQuotes(htmlTxt, modulesInfo):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    quotes = []
    dtList = soup.fetch("dt", {"class": "quote"})
    ddList = soup.fetch("dd", {"class": "author"})
    if len(dtList) == len(ddList) and len(dtList) > 0:
        for i in range(len(ddList)):
            quote = getAllTextFromTag(dtList[i])
            next = ddList[i]
            bItem = None
            while next and None == bItem:
                next = next.next
                if isinstance(next, Tag):
                    if next.name == "b":
                        bItem = next
                    elif next.name == "dt":
                        next = None
                    elif next.name == "select":
                        next = None
            if bItem:
                aItem = bItem.first("a")
                if aItem:
                    author = getAllTextFromTag(aItem)
                else:
                    author = getAllTextFromTag(bItem)
                quotes.append([author, "\"" + quote.strip() + "\""])
    if 0 == len(quotes):
        return UNKNOWN_FORMAT, None
    # build definition
    df = Definition()
    te = df.TextElement("Random Quotes", style=styleNamePageTitle)
    te.setJustification(justCenter)
    df.LineBreakElement()
    addQuotesToDefinition(df, quotes, modulesInfo)
    df.LineBreakElement()
    par = df.ParagraphElement(False)
    par.setJustification(justCenter)
    df.TextElement("Daily", link="s+quotes:daily")
    df.TextElement(" \x95 ", style=styleNameGray)
    df.TextElement("Random", link="s+quotes:random")
    df.PopParentElement()
    return QUOTES_DATA, universalDataFormatWithDefinition(df, [])
def get(self, url, now=None):
    # ym=2009.3
    # vmode=itiran
    if now is None:
        now = dt.now()
    tomorrow = now + delta(1)
    option = urllib.urlencode(dict(ym='%i.%i' % (now.year, now.month), vmode='itiran'))
    url += '?' + option
    self.write('getting feed from "%s".\n' % (url))
    p = Parser()
    f = urllib.urlopen(url)
    try:
        p.feed(f.read().decode('Shift-JIS'))
        p.goahead(0)
    finally:
        f.close()
    pages = []
    for a in p.findAll('a', attrs=dict(href="javaScript:void(0)")):
        m = parseOnClick.search(a['onclick'])
        if m:
            d = m.groupdict()
            #print d['year'], d['month'], d['day'], d['id']
            memo = '''http://www.backgammon.gr.jp/EventSchedule/calendar/calendar.cgi'''
            if int(d['day']) == tomorrow.day:
                option = urllib.urlencode(dict(action='memo', yy=d['year'], mm=d['month'], dd=d['day'], id=d['id']))
                f = urllib.urlopen(memo + '?' + option)
                try:
                    uhtml = f.read().decode('Shift-JIS')
                finally:
                    f.close()
                pages.append(Item(self, uhtml))
    return pages
def cve(self, irc, msg, args):
    word = self._prepare_term(args[0], "-")
    if re.search('cve', word, re.IGNORECASE) == None:
        url = 'http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=' + word
        category = 'keyword'
    else:
        url = 'http://cve.mitre.org/cgi-bin/cvename.cgi?name=' + word
        category = 'name'
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    cveroot = "http://cve.mitre.org"
    # Read the main table, extracting the words from the table cells.
    hreftable = soup.fetch('a', {'href': re.compile('cvename')}, limit=4)
    h1table = soup.fetch('h1')
    h1string = str(h1table)
    if category == 'keyword':
        fonttable = soup.fetch('font', limit=11)
    else:
        fonttable = soup.fetch('font', limit=17)
    if (len(fonttable) == 3) or (re.search('error', h1string, re.IGNORECASE) != None):
        irc.reply("No data found regarding " + word)
    else:
        cve = []
        href = []
        ret = ''
        for line in fonttable:
            string = str(line)
            cve.append(re.sub('^.*">|</font>|\\n', '', string))
        for line in hreftable:
            string = str(line)
            splitstring = string.split('>')
            #print splitstring
            href.append(re.sub('^.*="|"', '', splitstring[0]))
        ret = "%s %s" % (cve[3], cve[4])
        if category == 'keyword':
            for link in href:
                ret += cveroot + link + " "
        else:
            ret += cve[8]
        irc.reply(ret)
def parseDream(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    tdMain = soup.first("td", {"width": "437", "height": "1", "colspan": "3", "valign": "top", "rowspan": "2", "align": "left"})
    if not tdMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    fList = tdMain.fetch("font", {"face": "Arial", "size": "4", "color": "#6500CA"})
    definitionsCount = len(fList) - 3
    if 0 >= definitionsCount:
        return (NO_RESULTS, dNoResultsText)
    outerList = []
    realDefinitionsCount = 0
    for fItem in fList[:-3]:
        bItem = fItem.first("b")
        if bItem:
            itemTitle = getAllTextFromTag(bItem).replace("\n", "").strip()
            itemText = getAllTextFromToInBrFormat(getLastElementFromTag(fItem).next, fList[realDefinitionsCount + 1])
            itemText = itemText.replace("\n", "").strip()
            outerList.append((itemTitle, itemText))
            realDefinitionsCount += 1
    if 0 == realDefinitionsCount:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
def areaCodeByCity(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    resultsTable = soup.first("table", {"summary": "Results Content"})
    if resultsTable:
        strong = resultsTable.first("strong")
        if strong:
            if getAllTextFromTag(strong).startswith("Multiple cities with"):
                aList = resultsTable.fetch("a")
                for aItem in aList:
                    city = getAllTextFromTag(aItem)
                    returned.append(city)
                if len(returned) == 0:
                    return (UNKNOWN_FORMAT, m411UnknownFormatText)
                return (MULTIPLE_SELECT, string.join(returned, "\n"))
        # results
        return reverseZIPCodeLookup(htmlTxt)
    tables = soup.fetch("table", {"summary": "Search Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            code = getAllTextFromTag(tdList[0]).strip()
            country = getAllTextFromTag(tdList[1]).strip()
            timezone = getAllTextFromTag(tdList[2]).strip()
            if code != "New Search":
                smallList = (code, country, timezone)
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def _parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    trList = soup.fetch("tr")
    outerList = []
    for tr in trList:
        if len(tr.fetch("tr")) == 0:
            tdList = tr.fetch("td")
            if len(tdList) == 4:
                if tdList[0].first("span", {"class": "title"}):
                    rank = getAllTextFromTag(tdList[0])
                    title = getAllTextFromTag(tdList[1])
                    rating = getAllTextFromTag(tdList[2])
                    explicitness = getAllTextFromTag(tdList[3])
                    aItem = tdList[1].first("a")
                    if aItem:
                        url = aItem["href"]
                        outerList.append((rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
def tryParseSearchDefinition(htmlTxt, fArtistSearch, modulesInfo, keywords):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    input = soup.first("input", {"name": "albumName"})
    if input:
        return NO_RESULTS, None
    # get td's
    headerList = soup.fetch("td", {"class": "tb_header"})
    tdList = soup.fetch("td", {"class": "tb_row_r2"})
    if len(headerList) == 0 or len(tdList) == 0:
        return UNKNOWN_FORMAT, None
    # test modulo offset
    headersCount = len(headerList)
    if (len(tdList) % headersCount) != 0:
        return UNKNOWN_FORMAT, None
    searchResults = []
    # get results
    for index in range(len(tdList) - 1):
        artist = getAllTextFromTag(tdList[index]).strip()
        title = getAllTextFromTag(tdList[index + 1]).strip()
        urlStart = "show.php?id="
        aItem = tdList[index + 1].first("a", {"href": urlStart + "%"})
        if aItem:
            lyricsId = aItem['href'][len(urlStart):]
            searchResults.append([artist, title, lyricsId])
    if 0 == len(searchResults):
        return (UNKNOWN_FORMAT, None)
    if fArtistSearch:
        df = searchResultsToDefinitionThree(searchResults, modulesInfo)
    else:
        #df = searchResultsToDefinition(searchResults, modulesInfo)
        df = searchResultsToDefinitionTwo(searchResults, modulesInfo)
    return LYRICS_SEARCH, universalDataFormatWithDefinition(df, [["H", "Search: " + keywords]])
def define(self, irc, msg, args):
    """[word] look up the word in wordnet"""
    if len(args) != 1:
        irc.reply("you gotta give me a word to define")
        return
    word = self._prepare_term(args[0], "")
    url = 'http://wordnet.princeton.edu/perl/webwn?s=' + word
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    maintable = soup.fetch('li')
    retdef = []
    checkfordefs = len(maintable)
    if checkfordefs != 0:
        for lines in maintable:
            converttostring = str(lines)
            definition = re.sub('^.*\(|\).*$', '', converttostring)
            retdef.append(definition)
    else:
        retdef.append("not found. Is %s spelled correctly?" % word)
    irc.reply(word + ": " + "; ".join(retdef))