def parseCurrencyData(htmlText):
    global _g_regionsToISO
    soup = BeautifulSoup()
    soup.feed(htmlText)
    tables = soup("table", {"border": "0", "width": "368", "cellpadding": "0", "cellspacing": "0"})
    rows = tables[0].fetch("tr")
    currencies = dict()
    isFirstRow = True
    for row in rows:
        cells = row.fetch("td")
        if 4 != len(cells):
            continue
        if isFirstRow:
            isFirstRow = False
            continue
        region = retrieveContents(cells[1].fetch("div")[0].contents[0])
        try:
            rate = float(retrieveContents(cells[3].fetch("div")[0].contents[0]))
            if "*" == retrieveContents(cells[0].contents[0]):
                rate = 1 / rate
            abbrev = _g_regionsToISO[region]
            currencies[abbrev] = rate
        except ValueError:
            pass
    return (RESULTS_DATA, currencies)
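# A hedged usage sketch, not part of the original module: every parser in this
# file returns a (status, payload) pair, so a caller dispatches on the status
# constant.  getHttp and ratesUrl are assumed/hypothetical names here.
def _exampleCurrencyDriver():
    status, currencies = parseCurrencyData(getHttp(ratesUrl))
    if RESULTS_DATA == status:
        for abbrev in sorted(currencies.keys()):
            print "%s: %.4f" % (abbrev, currencies[abbrev])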
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # Country code(s)
    tableList = soup.fetch("table", {"summary": "Codes Results"})
    if len(tableList) != 1:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # found country code
    tdListA = tableList[0].fetch("td", {"style": "%padding-left:5px;"})
    tdListB = tableList[0].fetch("td", {"style": "%line-height:14pt;"})
    if len(tdListA) != len(tdListB):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    cityCodeList = []
    for i in range(len(tdListA)):
        if 0 == i:
            result.append([getAllTextFromTag(tdListB[i])])
        else:
            city = getAllTextFromTag(tdListA[i])
            code = getAllTextFromTag(tdListB[i])
            cityCodeList.append((city, code))
    # sort the (city, code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
def parseCurrency(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # TABLE WIDTH="100%" BORDER="0" CELLPADDING="0" CELLSPACING="0" BGCOLOR="#009900"
    # <TABLE WIDTH="100%" BORDER="0" CELLPADDING="1" CELLSPACING="1" BGCOLOR="#000000">
    findTable = soup.fetch("table", {"width": "100%", "border": "0", "cellpadding": "1", "cellspacing": "1", "bgcolor": "#000000"})
    #print findTable
    if not findTable:
        return (UNKNOWN_FORMAT, currencyNoResultsText)
    itemTable = findTable[0]
    findTableTR = itemTable.fetch("tr")
    # parse page and create dictionary
    for itemTR in findTableTR:
        findTD = itemTR.fetch("td")
        if 0 == len(findTD):
            continue
        if 4 != len(findTD):
            return (UNKNOWN_FORMAT, currencyNoResultsText)
        #print str(findTD[1].contents[0].contents[0].contents[0])
        #print str(findTD[2].contents[0].contents[0]).replace(",","").strip()
        abbrev = str(findTD[1].contents[0].contents[0].contents[0])
        g_AbbrevToRatesDict[abbrev] = float(str(findTD[2].contents[0].contents[0]).replace(",", "").strip())
    g_AbbrevToRatesDict["USD"] = 1.0
    return (RESULTS_DATA, g_AbbrevToRatesDict)
def getTextFromDirtyText(dirtyText):
    soup = BeautifulSoup()
    soup.feed("<xxx>" + dirtyText + "</xxx><yyy>test</yyy>")
    dirtySoup = soup.first("xxx")
    textWithBr = getAllTextFromToInBrFormat(dirtySoup, getLastElementFromTag(dirtySoup).next)
    text = textWithBr.replace("<br>", "\n").replace("<b>", "").replace("</b>", "")
    return text
def parseFirstDayHtml(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, humidity, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]
    bItems = soup.fetch("b", {"class": "obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("°F", "").strip()
    bItem = soup.first("b", {"class": "obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("°F", "").strip()
    tdList = soup.fetch("td", {"class": "obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert (len(tdList) == 4)
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%", "").strip()
        returned[5] = getAllTextFromTag(tdList[2]).replace("in.", "inches").strip()  ##todo: down, up, ...
        returned[6] = getAllTextFromTag(tdList[3]).replace("°F", "").strip()
    for r in returned:
        if r == None or r == "":
            return None
    return returned
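# Hedged caller sketch (not from the original module): parseFirstDayHtml
# returns None on a page it cannot parse, otherwise an 8-element list in the
# order given by the comment above; the local names below mirror that comment.
def _exampleFirstDayUsage(htmlTxt):
    parsed = parseFirstDayHtml(htmlTxt)
    if parsed is None:
        return None
    sky, temp, feelsLike, wind, hum, pressure, dewPoint, visibility = parsed
    return "%s, %sF (feels like %sF), wind %s" % (sky, temp, feelsLike, wind)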
def reverseZIPCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # results (usually one, but we handle more than one)
    tables = soup.fetch("table", {"summary": "Codes Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            timezone = getAllTextFromTag(tdList[2])
            if city != "New Search":
                smallList = (city, country, timezone)
                returned.append(smallList)
        elif len(tdList) == 2:
            # special case (911)
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            if city != "New Search":
                smallList = (city, country, "")
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def parseFirstDayHtmlYahoo(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, humidity, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, None]
    fontList = soup.fetch("font", {"face": "Arial", "size": "2"})
    textList = []
    wasFeelsLike = False
    for f in fontList:
        text = getAllTextFromTag(f).strip()
        if wasFeelsLike:
            textList.append(text)
        else:
            if text == "Feels Like:":
                textList.append(text)
                wasFeelsLike = True
    if len(textList) >= 16:
        smallList = textList[1::2]
        returned[0] = ""
        returned[1] = smallList[0].replace("°", "")
        returned[2] = smallList[0].replace("°", "")
        returned[3] = smallList[3]
        returned[4] = smallList[4].replace("%", "")
        returned[5] = smallList[2]
        returned[6] = smallList[1].replace("°", "")
        returned[7] = smallList[6]
    for r in returned:
        if r == None:
            return None
    return returned
def parseGasOld(htmlTxt, url=None, dbgLevel=0):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    testTitle = soup.first("title")
    if testTitle:
        if getAllTextFromTag(testTitle).startswith("GasBuddy.com - Find cheap gas prices in your city"):
            return (LOCATION_UNKNOWN, gLocationUnknownText)
    outerList = []
    trList = soup.fetch("tr")
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 8 == len(tdList):
            if tdList[1].first("table"):
                price = getAllTextFromTag(tdList[0]).strip()
                name = getAllTextFromTag(tdList[2]).strip()
                address = getAllTextFromTag(tdList[4]).strip()
                area = getAllTextFromTag(tdList[5]).strip()
                time = getAllTextFromTag(tdList[6]).strip()
                smallList = [price, name, address, area, time]
                outerList.append(smallList)
        else:
            if 0 != len(tdList):
                firstB = tdList[0].first("b")
                if firstB:
                    if getAllTextFromTag(firstB).startswith("No gas prices found."):
                        return (NO_RESULTS, gNoResultsText)
    if 0 == len(outerList):
        if dbgLevel > 0:
            print "len(outerList)==0"
        return parsingFailed(url, htmlTxt)
    return (GAS_DATA, universalDataFormatReplaceEntities(outerList))
def parseName(htmlTxt):
    # normalize malformed HTML comments ("<! -- ") so the parser handles them
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    fontList = soup.fetch("font", {"face": "arial"})
    for fontItem in fontList:
        iItem = fontItem.first("i")
        if iItem:
            if str(iItem.contents[0]).startswith("Your search for"):
                return (NO_RESULTS, sNoResultsText)
    # get table data
    trList = soup.fetch("tr", {"bgcolor": "#ffffff"})
    resultsCount = 0
    outerList = []
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 5 == len(tdList):
            symbol = getAllTextFromTag(tdList[0]).strip()
            url = tdList[0].first("a")['href']
            name = getAllTextFromTag(tdList[1]).strip()
            market = getAllTextFromTag(tdList[2]).strip()
            industry = getAllTextFromTag(tdList[3]).strip()
            outerList.append((url, symbol, name, market, industry))
            resultsCount += 1
    # no results?
    if 0 == resultsCount:
        return (NO_RESULTS, sNoResultsText)
    return (STOCKS_LIST, universalDataFormatReplaceEntities(outerList))
def tryParseDetails(htmlTxt, updateString):
    htmlTxt = removeCloseTagAttr(htmlTxt)
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    flightInfo = []
    table = soup.first("table", {"name": "flight_info"})
    trList = []
    if table:
        for tr in table.fetch("tr"):
            if len(tr.fetch("td")) == 4:
                trList.append(tr)
            elif len(tr.fetch("td")) == 1:
                img = tr.first("img", {"alt": "Continuing on To"})
                if img:
                    trList.append(tr)
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 4:
            # "&nbsp;" assumed here: the source showed replace(" ", " "),
            # a no-op left behind when the entity was decoded during extraction
            info = getAllTextFromTag(tdList[0]).replace("&nbsp;", " ").strip()
            infoFrom = getAllTextFromTag(tdList[1]).replace("&nbsp;", " ").strip()
            infoTo = getAllTextFromTag(tdList[3]).replace("&nbsp;", " ").strip()
            if info != "":
                flightInfo.append([info, infoFrom, infoTo])
        else:
            flightInfo.append([""])
    flight = ""
    table = soup.first("table", {"name": "headbar2"})
    if table:
        bItem = table.first("b")
        if bItem:
            flight = getAllTextFromTag(bItem)
    if 0 == len(flightInfo) or "" == flight:
        return UNKNOWN_FORMAT, None
    # definition
    df = Definition()
    df.TextElement(flight, style=styleNameBold)
    df.LineBreakElement(1, 2)
    index = 0
    for item in flightInfo:
        # info, from, to
        if len(item) == 3:
            df.TextElement(item[0], style=styleNameHeader)
            if item[1] != "":
                df.LineBreakElement()
                df.TextElement(item[1])
            if item[2] != "":
                gtxt = df.TextElement(item[2])
                gtxt.setJustification(justRight)
            else:
                df.LineBreakElement()
        else:
            df.HorizontalLineElement()
    return RESULTS_DATA, universalDataFormatWithDefinition(df, [["U", updateString]])
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException=False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # assumed handler (the original was truncated): log and skip this book
        log(SEV_EXC, exceptionAsStr(ex))
        return None
def parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # find table with results
    tableList = soup.fetch("table", {"cellpadding": "0", "cellspacing": "0", "border": "0", "width": "100%"})
    if 0 == len(tableList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    outerList = []
    for table in tableList:
        trList = table.fetch("tr")
        if 2 <= len(trList):
            tdCount = len(trList[0].fetch("td"))
            if 3 > tdCount:
                return (UNKNOWN_FORMAT, jUnknownFormatText)
            for tr in trList[1:]:
                tdList = tr.fetch("td")
                rank = ""
                if 4 == tdCount:
                    rank = getAllTextFromTag(tdList[0])
                title = getAllTextFromTag(tdList[-3])
                rating = getAllTextFromTag(tdList[-2])
                explicitness = getAllTextFromTag(tdList[-1])
                url = tdList[-3].first("a")["href"]
                if not url:
                    return (UNKNOWN_FORMAT, jUnknownFormatText)
                outerList.append((rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
def parseCurrencyData(htmlText):
    global _g_imgRe
    soup = BeautifulSoup()
    soup.feed(htmlText)
    # <table width="60%" border="0" cellpadding="3" summary="Displays latest tourist currency rates">
    table = soup.first("table", {"border": "0", "width": "60%", "cellpadding": "3", "summary": "Displays latest tourist currency rates"})
    assert table is not None
    tbody = table.first("tbody")
    assert tbody is not None
    rows = tbody.fetch("tr")
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        img = cells[0].fetch("img")[0]
        match = _g_imgRe.match(img["src"])
        if match is None:
            continue
        abbrev = match.group(1)
        rate = float(str(cells[2].contents[0]).strip().split()[0])
        currencies[abbrev] = rate
    # normalize so that every rate is expressed relative to USD
    usdRate = currencies["USD"]
    for key in currencies.iterkeys():
        currencies[key] = currencies[key] / usdRate
    assert 1 == currencies["USD"]
    return (RESULTS_DATA, currencies)
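# Worked example (a sketch under an assumption, not original code): after the
# normalization above every rate is relative to USD, so converting an amount
# is a ratio of rates.  Assumes rates are units of currency per USD.
def _exampleConvert(currencies, amount, fromAbbrev, toAbbrev):
    return amount * currencies[toAbbrev] / currencies[fromAbbrev]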
def reverseAreaCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id": "listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    if len(trList) == 0:
        return UNKNOWN_FORMAT, None
    # ignore headers ([1:])
    for trItem in trList[1:]:
        if 0 == len(trItem.fetch("tr")):
            # the site's markup wrongly nests rows (<tr><tr> ... </tr></tr>),
            # so only process rows with no nested <tr>
            tdList = trItem.fetch("td", {"id": "subtextid"})
            if 3 == len(tdList):
                city = getAllTextFromTag(tdList[0])
                country = getAllTextFromTag(tdList[1])
                timezone = getAllTextFromTag(tdList[2])
                smallList = (city, country, timezone)
                returned.append(smallList)
            elif 2 == len(tdList):
                city = getAllTextFromTag(tdList[0])
                country = ""
                timezone = getAllTextFromTag(tdList[1])
                smallList = (city, country, timezone)
                returned.append(smallList)
    if 0 == len(returned):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def reversePhoneLookupWhitepages(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    div = soup.first("div", {"class": "listings"})
    if div:
        for table in div.fetch("table"):
            for tr in table.fetch("tr"):
                text1 = tr.first("div", {"class": "textb"})
                text2 = tr.first("div", {"class": "text"})
                if text1 and text2:
                    name = getAllTextFromTag(text1)
                    cont = getAllTextFromToInBrFormat(text2, getLastElementFromTag(text2).next)
                    parts = cont.split("<br>")
                    (address, city, phone) = ("", "", "")
                    if len(parts) == 3:
                        (address, city, phone) = parts
                    if len(parts) == 2:
                        (city, phone) = parts
                    if len(parts) == 4:
                        (prefix, address, city, phone) = parts
                    returned.append((name, address.strip(), city.strip(), phone.strip()))
    if len(returned) == 0:
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def mailsubject(self):
    p = Parser()
    p.feed(self._imp)
    p.goahead(0)
    div = p.find('div', attrs={'class': 'moji'})
    # FIXME: assuming BeautifulSoup uses utf8
    return str(div.contents[0]).decode('utf8')
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id": "listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    cityCodeList = []
    for trItem in trList:
        if 0 == len(trItem.fetch("tr")):
            tdList = trItem.fetch("td", {"id": "subtextid"})
            if 2 == len(tdList):
                if 0 == len(result):
                    result.append([getAllTextFromTag(tdList[1])])
                else:
                    city = getAllTextFromTag(tdList[0])
                    code = getAllTextFromTag(tdList[1])
                    cityCodeList.append((city, code))
    # sort the (city, code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    if 0 == len(result):
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
def parseMultiselect(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    aList = soup.fetch("a", {"href": "/weather/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/businesstraveler/local/%"})
    lastCode = ""
    resultsCount = 0
    for aItem in aList:
        afterLocal = aItem['href'].split("local/")
        if 2 == len(afterLocal):
            textAfterLocal = afterLocal[1]
            if 8 < len(textAfterLocal):
                code = textAfterLocal[:8]
                textAfterLocal = textAfterLocal[8:]
                if textAfterLocal.startswith("?from=search_"):
                    # lastCode accumulates codes we have already seen
                    if -1 == lastCode.find(code):
                        lastCode += code
                        text = getAllTextFromTag(aItem)
                        resultsCount += 1
                        returned.append((text, code))
    if 0 == resultsCount:
        return (LOCATION_UNKNOWN, None)
    return (LOCATION_MULTISELECT, universalDataFormatReplaceEntities(returned))
def mailbody(self):
    p = Parser()
    p.feed(self._imp)
    p.goahead(0)
    td = p.find('td', attrs={'class': 'moji'})
    # FIXME: assuming BeautifulSoup uses utf8
    u = str(td).decode('utf8')
    return tagStrip.sub(replproc, u)
def parseDream2(htmlTxt):
    soup = BeautifulSoup()
    # TODO: this is temporary:
    htmlTxt = htmlTxt.replace("/*<![CDATA[*/ @import \"/knowledge/stylesheets/monobook/main.css\"; /*]]>*/", "")
    soup.feed(htmlTxt)
    tableMain = soup.fetch("table", {"width": "768", "align": "center", "cellspacing": "0", "cellpadding": "0"})
    if not tableMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    td = None
    for table in tableMain:
        tr = table.first("tr")
        if tr:
            tdTest = tr.first("td", {"width": "100%", "valign": "top"})
            if tdTest:
                td = tdTest
    if not td:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    # reparse the cell; without this the searches below do not work
    soup2 = BeautifulSoup()
    soup2.feed(str(td).replace("<br />>", ""))
    td = soup2.first("td")
    # no results?
    if td.first("center"):
        return (NO_RESULTS, dNoResultsText)
    # results
    bTable = td.fetch("b")
    if not bTable:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    outerList = []
    for bItem in bTable:
        title = getAllTextFromTag(bItem)
        next = getLastElementFromTag(bItem)
        pItem = None
        while next and not pItem:
            if isinstance(next, Tag):
                if next.name == "p":
                    pItem = next
            next = next.next
        if pItem:
            text = getAllTextFromTagWithA(pItem.first("font"))
            if text.startswith("Interpretation: "):
                text = text[len("Interpretation: "):]
            outerList.append((title, text))
    if 0 == len(outerList):
        return (NO_RESULTS, dNoResultsText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
def personSearch(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    ttlPref = soup.first("td", {"class": "TTLPREF"})
    if not ttlPref:
        ttlPref = soup.first("span", {"class": "TTLPREF"})
    if not ttlPref:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # too many results:
    font = ttlPref.first("font", {"color": "#FF0000"})
    if font:
        if "No results." == font.contents[0]:
            return (NO_RESULTS, m411NoResultsText)
        if "Results found in multiple cities." == font.contents[0]:
            brList = ttlPref.fetch("br")
            brList = brList[4:]  ## skip text about select
            for br in brList:
                text = str(br.next).replace("<br />", "").replace("\n", "").strip()
                if len(text) > 0:
                    returned.append(text)
            return (MULTIPLE_SELECT, string.join(returned, "\n"))
        return (TOO_MANY_RESULTS, m411TooManyResults)
    # results:
    brList = ttlPref.fetch("br")
    resultsCount = len(brList) - 2
    if 0 == resultsCount:
        # no city?
        if "NO CITY FOUND" == str(brList[1].next).replace("\n", "").strip():
            return (NO_CITY, m411NoCity)
    results = resultsCount / 5
    if results * 5 != resultsCount:  ## test if number of <br> is 5*n+2
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # get them
    brList = brList[1:]  ## skip first br
    counter = 0
    smallList = []
    for br in brList:
        text = str(br.next).replace("<br />", "").replace("\n", "").strip()
        if results > 0:
            if 0 == counter:
                smallList = [text]
            if 1 == counter or 2 == counter:
                smallList.append(text)
            if 3 == counter:
                smallList.append(text)
                returned.append(smallList)
                results -= 1
        counter += 1
        if 5 == counter:
            counter = 0
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
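# Hedged dispatch sketch (hypothetical caller, not original code): personSearch
# can produce several statuses; MULTIPLE_SELECT carries a newline-separated
# city list that a caller would present to the user before re-querying.
def _examplePersonSearchDispatch(htmlTxt):
    status, payload = personSearch(htmlTxt)
    if RESULTS_DATA == status:
        return payload                  # UDF-formatted person records
    if MULTIPLE_SELECT == status:
        return payload.split("\n")      # cities to choose from
    return payload                      # user-visible message text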
def _parseRandomJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    table = soup.first("table", {"id": "jokeIframeTable2"})
    if not table:
        return UNKNOWN_FORMAT, None
    # title
    titleSpan = table.first("span", {"class": "jokeTitle_v2"})
    if not titleSpan:
        return UNKNOWN_FORMAT, None
    title = getAllTextFromTag(titleSpan)
    # text
    trList = table.fetch("tr")
    text = ""
    if len(trList) > 6:
        tdList = trList[5].fetch("td")
        if len(tdList) == 3:
            text = getAllTextFromToInBrFormat(tdList[1], tdList[2])
            # "&nbsp;" assumed: the source showed the no-op replace(" ", " "),
            # most likely a decoded entity lost in extraction
            if len(text.replace("&nbsp;", " ").strip()) < 2:
                text = ""
    if "" == text:
        return UNKNOWN_FORMAT, None
    smallList = [title, text]
    # rating
    table = soup.first("table", {"id": "Table5"})
    if table:
        td = table.first("td")
        if td:
            imgList = td.fetch("img", {"src": "%"})
            rating = "not rated"
            # icon file name -> numeric rating
            translator = {
                "iconrate_one": "1",
                "iconrate_two": "2",
                "iconrate_three": "3",
                "iconrate_four": "4",
                "iconrate_five": "5",
                "iconrate_one_half": "1.5",
                "iconrate_two_half": "2.5",
                "iconrate_three_half": "3.5",
                "iconrate_four_half": "4.5",
                "iconrate_zero_half": "0.5",
            }
            for img in imgList:
                src = img['src']
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                try:
                    rating = translator[src]
                except KeyError:
                    pass
            smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def parse(wordtosearch):
    url = 'http://dictionary.reference.com/search?q=' + wordtosearch
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    # Read the main table, extracting the words from the table cells.
    maintable = soup.fetch('li')
    # There are 6 lines containing <li> at the bottom that we don't want to
    # print, so we remove them from the list by adjusting the count.
    removeli = len(maintable) - 6
    counter = 0
    # if removeli is 0 then we need to look for dl tags
    if removeli == 0:
        # fetch dl tags
        maintable = soup.fetch('dl')
        for defs in maintable:
            converttostring = str(defs)
            splitstring = converttostring.split('<dd>')
            removetrash = re.sub('^ |</dd.*dl>|<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>', '', splitstring[1])
            addunderscores = re.sub('<u><i>|</i></u>', '_', removetrash)
            # '&amp;' assumed: the source showed re.sub('&', '&', ...), a no-op
            # left behind when the entity was decoded during extraction
            convertampersands = re.sub('&amp;', '&', addunderscores)
            definition = convertampersands
            print definition
    else:
        for counter in range(removeli):
            defs = maintable[counter]
            converttostring = str(defs)
            splitstring = converttostring.split('<li>')
            if len(splitstring) != 1:
                removetrash = re.sub('^ |(<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>)', '', splitstring[1])
                addunderscores = re.sub('(<u><i>|</i></u>)', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
            else:
                removetrash = re.sub('^ |<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>', '', splitstring[0])
                addunderscores = re.sub('<u><i>|</u></i>', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
            counter += 1
def _parse_letter_page(self, letter, html, index):
    self._check_finish()
    soup = BeautifulSoup()
    soup.feed(html)
    div = soup.first("div", {"class": "sidebar-module"})
    assert div is not None
    count = int(retrieveContents(div.contents[2]).split()[2])
    offset = 0
    self._lock.acquire()
    try:
        if count <= self._data[letter][0]:
            print 'Letter "%s" is up to date (%d records).' % (letter, self._data[letter][0])
            return True, count, 0
        offset = self._offsets[letter]
    finally:
        self._lock.release()
    spidered = 0
    div = soup.first("div", {"class": "titleList"})
    assert div is not None
    anchors = div.fetch("a")
    urls = []
    for a in anchors:
        url = _g_manybooks_url + urllib.quote(a["href"])
        urls.append(url)
    for url in urls:
        self._check_finish()
        i = -1
        self._lock.acquire()
        try:
            books = self._data[letter][1]
            i = _find_book_index(books, url, index)
        finally:
            self._lock.release()
        if -1 != i:
            index = i + 1
        else:
            book = _spider_book_info(url, letter)
            if book is not None:
                spidered += 1
                self._lock.acquire()
                try:
                    self._fresh_books.append((letter, index + offset, book))
                    if len(self._fresh_books) == self.flush_after:
                        self._flush_books()
                    offset += 1
                    self._offsets[letter] = offset
                    if self._data[letter][0] + offset == count:
                        return True, count, spidered
                finally:
                    self._lock.release()
    return (index + offset == count), index, spidered
def parseCurrencyData(htmlText):
    global _g_metaRe
    htmlText = _g_metaRe.sub("", htmlText)
    soup = BeautifulSoup()
    soup.feed(htmlText)
    tables = soup("table", {"cellpadding": "2", "cellspacing": "0", "border": "0", "width": "468"})
    assert 1 == len(tables)
    rows = tables[0].fetch("tr", {"bgcolor": "#FFFFFF"})
    currencies = dict()
    for row in rows:
        fonts = row.fetch("font")
        currencies[str(fonts[2].contents[0])] = float(str(fonts[3].contents[0]))
    return (RESULTS_DATA, currencies)
def parseJoke(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    table = soup.first("table", {"width": "328", "id": "Table2"})
    if not table:
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    tdList = table.fetch("td", {"colspan": "3", "valign": "top", "class": "body"})
    if 3 != len(tdList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    # simple format - simple parser
    title = getAllTextFromTag(tdList[0]).strip()
    text = getAllTextFromToInBrFormat(tdList[1], tdList[2].previous)
    smallList = [title, text]
    # add rating information; the random joke page sometimes returns next to
    # nothing, so sanity-check the length first
    if len(title) + len(text) > 16:
        span = soup.first("span", {"class": "body"})
        if span:
            text = getAllTextFromTag(span).replace("\n", "").strip()
            img = span.first("img", {"src": "%"})
            if text.startswith("CURRENT RATING") and img:
                src = img['src']
                src = src.split("/")[-1]
                src = src.replace(".gif", "")
                # icon file name -> numeric rating
                translator = {
                    "iconrate_one": "1",
                    "iconrate_two": "2",
                    "iconrate_three": "3",
                    "iconrate_four": "4",
                    "iconrate_five": "5",
                    "iconrate_one_half": "1.5",
                    "iconrate_two_half": "2.5",
                    "iconrate_three_half": "3.5",
                    "iconrate_four_half": "4.5",
                    "iconrate_zero_half": "0.5",
                }
                rating = "not rated"
                try:
                    rating = translator[src]
                except KeyError:
                    pass
                smallList.append(rating)
    outerList = [smallList]
    return (JOKE_DATA, universalDataFormatReplaceEntities(outerList))
def parseStock(htmlTxt):
    # normalize malformed HTML comments ("<! -- ") so the parser handles them
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = testNoResults(soup)
    if NO_RESULTS == noResults:
        return (NO_RESULTS, sNoResultsText)
    # get name
    nameTag = soup.first("td", {"height": "30", "class": "ygtb"})
    if not nameTag:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    name = getAllTextFromTag(nameTag).strip()
    # get all data from table
    bigTable = soup.fetch("table", {"width": "580", "id": "yfncsumtab"})
    if 1 != len(bigTable):
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    tdDataList = bigTable[0].fetch("td", {"class": "yfnc_tabledata1"})
    innerList = [name]
    counter = 0
    for tdItem in tdDataList:
        if 2 == counter:
            # the 3rd element carries the up/down icon
            imgItem = tdDataList[2].first("img")
            upDown = ""
            if imgItem:
                upDown = imgItem['alt']
            innerList.append(upDown)
            bItem = tdDataList[2].first("b")
            itemText = ""
            if bItem:
                itemText = getAllTextFromTag(bItem).strip()
            innerList.append(itemText)
        else:
            itemText = getAllTextFromTag(tdItem).strip()
            innerList.append(itemText)
        counter += 1
    # any results?
    if 0 == counter:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    # one-item UDF
    outerList = [innerList]
    return (STOCKS_DATA, universalDataFormatReplaceEntities(outerList))
def parseCurrencyData(htmlText):
    global _g_xercRe
    htmlText = _g_xercRe.sub("", htmlText)
    htmlText = htmlText.replace("!BORDERCOLOR", "BORDERCOLOR")
    soup = BeautifulSoup()
    soup.feed(htmlText)
    tables = soup("table", {"class": "ictab"})
    assert 1 == len(tables)
    rows = tables[0].fetch("tr", {"valign": "top"})
    currencies = dict()
    for row in rows:
        cells = row.fetch("td")
        if 4 != len(cells):
            continue
        currencies[str(cells[0].contents[0]).strip()] = float(str(cells[3].contents[0]).strip().replace(",", ""))
    return (RESULTS_DATA, currencies)
def reversePhoneLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResultsReversePhoneLookup(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    tdWithResults = soup.first("td", {"class": "TTLPREF"})
    if not tdWithResults:
        tdWithResults = soup.first("span", {"class": "TTLPREF"})
    if not tdWithResults:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # results are inside <td>
    fontColor = tdWithResults.first("font")
    if fontColor:
        # "No details available."
        counter = 0
        for br in tdWithResults.fetch("br"):
            # we believe the city follows the 6th <br>
            if counter == 5:
                city = "%s" % str(br.next).replace("\n", "").strip()
                returned.append(["", "", city, ""])
            counter += 1
    else:
        # all data, or city & phone
        counter = 0
        person = ""
        address = ""
        city = ""
        phone = ""
        for br in tdWithResults.fetch("br"):
            # 7 <br> in <td>
            if 1 == counter:
                if not isinstance(br.next, Tag):
                    person = "%s" % str(br.next).replace("\n", "").strip()
            if 2 == counter:
                if not isinstance(br.next, Tag):
                    address = "%s" % str(br.next).replace("\n", "").strip()
            if 3 == counter:
                city = "%s" % str(br.next).replace("\n", "").strip()
            if 4 == counter:
                phone = "%s" % str(br.next).replace("\n", "").strip()
            counter += 1
        returned.append((person, address, city, phone))
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def parseRandomQuotes(htmlTxt, modulesInfo):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    quotes = []
    dtList = soup.fetch("dt", {"class": "quote"})
    ddList = soup.fetch("dd", {"class": "author"})
    if len(dtList) == len(ddList) and len(dtList) > 0:
        for i in range(len(ddList)):
            quote = getAllTextFromTag(dtList[i])
            next = ddList[i]
            bItem = None
            while next and None == bItem:
                next = next.next
                if isinstance(next, Tag):
                    if next.name == "b":
                        bItem = next
                    elif next.name == "dt":
                        next = None
                    elif next.name == "select":
                        next = None
            if bItem:
                aItem = bItem.first("a")
                if aItem:
                    author = getAllTextFromTag(aItem)
                else:
                    author = getAllTextFromTag(bItem)
                quotes.append([author, "\"" + quote.strip() + "\""])
    if 0 == len(quotes):
        return UNKNOWN_FORMAT, None
    # build definition
    df = Definition()
    te = df.TextElement("Random Quotes", style=styleNamePageTitle)
    te.setJustification(justCenter)
    df.LineBreakElement()
    addQuotesToDefinition(df, quotes, modulesInfo)
    df.LineBreakElement()
    par = df.ParagraphElement(False)
    par.setJustification(justCenter)
    df.TextElement("Daily", link="s+quotes:daily")
    df.TextElement(" \x95 ", style=styleNameGray)
    df.TextElement("Random", link="s+quotes:random")
    df.PopParentElement()
    return QUOTES_DATA, universalDataFormatWithDefinition(df, [])
def get(self, url, now=None):
    # ym=2009.3
    # vmode=itiran
    if now is None:
        now = dt.now()
    tomorrow = now + delta(1)
    option = urllib.urlencode(dict(ym='%i.%i' % (now.year, now.month), vmode='itiran'))
    url += '?' + option
    self.write('getting feed from "%s".\n' % (url))
    p = Parser()
    f = urllib.urlopen(url)
    try:
        p.feed(f.read().decode('Shift-JIS'))
        p.goahead(0)
    finally:
        f.close()
    pages = []
    for a in p.findAll('a', attrs=dict(href="javaScript:void(0)")):
        m = parseOnClick.search(a['onclick'])
        if m:
            d = m.groupdict()
            #print d['year'], d['month'], d['day'], d['id']
            memo = '''http://www.backgammon.gr.jp/EventSchedule/calendar/calendar.cgi'''
            if int(d['day']) == tomorrow.day:
                option = urllib.urlencode(dict(action='memo', yy=d['year'], mm=d['month'], dd=d['day'], id=d['id']))
                f = urllib.urlopen(memo + '?' + option)
                try:
                    uhtml = f.read().decode('Shift-JIS')
                finally:
                    f.close()
                pages.append(Item(self, uhtml))
    return pages
def cve(self, irc, msg, args):
    word = self._prepare_term(args[0], "-")
    if re.search('cve', word, re.IGNORECASE) == None:
        url = 'http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=' + word
        category = 'keyword'
    else:
        url = 'http://cve.mitre.org/cgi-bin/cvename.cgi?name=' + word
        category = 'name'
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    cveroot = "http://cve.mitre.org"
    # Read the main table, extracting the words from the table cells.
    hreftable = soup.fetch('a', {'href': re.compile('cvename')}, limit=4)
    h1table = soup.fetch('h1')
    h1string = str(h1table)
    if category == 'keyword':
        fonttable = soup.fetch('font', limit=11)
    else:
        fonttable = soup.fetch('font', limit=17)
    if (len(fonttable) == 3) or (re.search('error', h1string, re.IGNORECASE) != None):
        irc.reply("No data found regarding " + word)
    else:
        cve = []
        href = []
        ret = ''
        for line in fonttable:
            string = str(line)
            cve.append(re.sub('^.*">|</font>|\\n', '', string))
        for line in hreftable:
            string = str(line)
            splitstring = string.split('>')
            #print splitstring
            href.append(re.sub('^.*="|"', '', splitstring[0]))
        ret = "%s %s" % (cve[3], cve[4])
        if category == 'keyword':
            for link in href:
                ret += cveroot + link + " "
        else:
            ret += cve[8]
        irc.reply(ret)
def parseDream(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    tdMain = soup.first("td", {"width": "437", "height": "1", "colspan": "3", "valign": "top", "rowspan": "2", "align": "left"})
    if not tdMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    fList = tdMain.fetch("font", {"face": "Arial", "size": "4", "color": "#6500CA"})
    definitionsCount = len(fList) - 3
    if 0 >= definitionsCount:
        return (NO_RESULTS, dNoResultsText)
    outerList = []
    realDefinitionsCount = 0
    for fItem in fList[:-3]:
        bItem = fItem.first("b")
        if bItem:
            itemTitle = getAllTextFromTag(bItem).replace("\n", "").strip()
            itemText = getAllTextFromToInBrFormat(getLastElementFromTag(fItem).next, fList[realDefinitionsCount + 1])
            itemText = itemText.replace("\n", "").strip()
            outerList.append((itemTitle, itemText))
            realDefinitionsCount += 1
    if 0 == realDefinitionsCount:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
def areaCodeByCity(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    resultsTable = soup.first("table", {"summary": "Results Content"})
    if resultsTable:
        strong = resultsTable.first("strong")
        if strong:
            if getAllTextFromTag(strong).startswith("Multiple cities with"):
                aList = resultsTable.fetch("a")
                for aItem in aList:
                    city = getAllTextFromTag(aItem)
                    returned.append(city)
                if len(returned) == 0:
                    return (UNKNOWN_FORMAT, m411UnknownFormatText)
                return (MULTIPLE_SELECT, string.join(returned, "\n"))
        # results
        return reverseZIPCodeLookup(htmlTxt)
    tables = soup.fetch("table", {"summary": "Search Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            code = getAllTextFromTag(tdList[0]).strip()
            country = getAllTextFromTag(tdList[1]).strip()
            timezone = getAllTextFromTag(tdList[2]).strip()
            if code != "New Search":
                smallList = (code, country, timezone)
                returned.append(smallList)
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def _parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    trList = soup.fetch("tr")
    outerList = []
    for tr in trList:
        if len(tr.fetch("tr")) == 0:
            tdList = tr.fetch("td")
            if len(tdList) == 4:
                if tdList[0].first("span", {"class": "title"}):
                    rank = getAllTextFromTag(tdList[0])
                    title = getAllTextFromTag(tdList[1])
                    rating = getAllTextFromTag(tdList[2])
                    explicitness = getAllTextFromTag(tdList[3])
                    aItem = tdList[1].first("a")
                    if aItem:
                        url = aItem["href"]
                        outerList.append((rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
def tryParseSearchDefinition(htmlTxt, fArtistSearch, modulesInfo, keywords):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    input = soup.first("input", {"name": "albumName"})
    if input:
        return NO_RESULTS, None
    # get td's
    headerList = soup.fetch("td", {"class": "tb_header"})
    tdList = soup.fetch("td", {"class": "tb_row_r2"})
    if len(headerList) == 0 or len(tdList) == 0:
        return UNKNOWN_FORMAT, None
    # test modulo offset
    headersCount = len(headerList)
    if (len(tdList) % headersCount) != 0:
        return UNKNOWN_FORMAT, None
    searchResults = []
    # get results
    for index in range(len(tdList) - 1):
        artist = getAllTextFromTag(tdList[index]).strip()
        title = getAllTextFromTag(tdList[index + 1]).strip()
        urlStart = "show.php?id="
        aItem = tdList[index + 1].first("a", {"href": urlStart + "%"})
        if aItem:
            lyricsId = aItem['href'][len(urlStart):]
            searchResults.append([artist, title, lyricsId])
    if 0 == len(searchResults):
        return (UNKNOWN_FORMAT, None)
    if fArtistSearch:
        df = searchResultsToDefinitionThree(searchResults, modulesInfo)
    else:
        #df = searchResultsToDefinition(searchResults, modulesInfo)
        df = searchResultsToDefinitionTwo(searchResults, modulesInfo)
    return LYRICS_SEARCH, universalDataFormatWithDefinition(df, [["H", "Search: " + keywords]])
def define(self, irc, msg, args):
    """[word] look up the word in wordnet"""
    if len(args) != 1:
        irc.reply("you gotta give me a word to define")
        return
    word = self._prepare_term(args[0], "")
    url = 'http://wordnet.princeton.edu/perl/webwn?s=' + word
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    maintable = soup.fetch('li')
    retdef = []
    checkfordefs = len(maintable)
    if checkfordefs != 0:
        for lines in maintable:
            converttostring = str(lines)
            definition = re.sub('^.*\(|\).*$', '', converttostring)
            retdef.append(definition)
    else:
        retdef.append("not found. Is %s spelled correctly?" % word)
    irc.reply(word + ": " + "; ".join(retdef))