Example #1
0
def parse_data(data):
    """Parse a Google results page into a list of result dicts.

    Returns either a single calculator answer,
    ``[{"type": "string", "string": ...}]``, or a list of
    ``{"type": "result", "href": ..., "text": ...}`` entries.

    Raises:
        NoResultsException: if the results container is missing or it
            contains no result items.
    """
    # Explicit parser silences bs4's "no parser specified" warning and
    # keeps parsing consistent regardless of which parsers are installed.
    page = BeautifulSoup(data, "html.parser")

    results = page.find("div", id="res")
    if results is None:
        raise NoResultsException

    # Google marks calculator answers with a distinctive icon; when it is
    # present, return the computed expression instead of web results.
    calc = results.find("img", src="/images/icons/onebox/calculator-40.gif")
    if calc is not None:
        calc = results.find("h2", {"class": "r"})
        if calc is not None:
            # find_all() always returns a (possibly empty) list, never None,
            # so no None check is needed.
            for sup in calc.find_all("sup"):
                # Render exponents as "^n"; replace_with() is the current
                # bs4 spelling of the deprecated replaceWith().
                sup.contents[0].replace_with("^" + sup.contents[0])
            return [dict(type="string", string=util.strip_html(calc).decode("utf-8"))]

    nresults = results.find_all("li", {"class": "g"})
    if not nresults:
        raise NoResultsException

    processed_results = []
    for item in nresults:
        a_tag = item.find("a")
        if a_tag is not None:
            # Google wraps target URLs in a redirect; the real URL lives in
            # the "q" query-string parameter.
            processed_results.append(
                dict(type="result",
                     href=urlparse.parse_qs(urlparse.urlparse(a_tag["href"]).query)["q"][0],
                     text=util.strip_html(a_tag).decode("utf-8")))

    return processed_results
Example #2
0
def parse_data(data):
    """Parse a Google results page into a list of result dicts.

    Returns either a single calculator answer,
    ``[{"type": "string", "string": ...}]``, or a list of
    ``{"type": "result", "href": ..., "text": ...}`` entries.

    Raises:
        NoResultsException: if the results container is missing or it
            contains no result items.
    """
    # Explicit parser silences bs4's "no parser specified" warning and
    # keeps parsing consistent regardless of which parsers are installed.
    page = BeautifulSoup(data, "html.parser")

    results = page.find("div", id="res")
    if results is None:
        raise NoResultsException

    # Google marks calculator answers with a distinctive icon; when it is
    # present, return the computed expression instead of web results.
    calc = results.find("img", src="/images/icons/onebox/calculator-40.gif")
    if calc is not None:
        calc = results.find("h2", {"class": "r"})
        if calc is not None:
            # find_all() always returns a (possibly empty) list, never None,
            # so no None check is needed.
            for sup in calc.find_all("sup"):
                # Render exponents as "^n"; replace_with() is the current
                # bs4 spelling of the deprecated replaceWith().
                sup.contents[0].replace_with("^" + sup.contents[0])
            return [
                dict(type="string",
                     string=util.strip_html(calc).decode("utf-8"))
            ]

    nresults = results.find_all("li", {"class": "g"})
    if not nresults:
        raise NoResultsException

    processed_results = []
    for item in nresults:
        a_tag = item.find("a")
        if a_tag is not None:
            # Google wraps target URLs in a redirect; the real URL lives in
            # the "q" query-string parameter.
            processed_results.append(
                dict(type="result",
                     href=urlparse.parse_qs(
                         urlparse.urlparse(a_tag["href"]).query)["q"][0],
                     text=util.strip_html(a_tag).decode("utf-8")))

    return processed_results
Example #3
0
def fetch_title(callback, m):
    """Fetch the URL matched by *m* and return its page ``<title>`` text.

    Returns a short error string when the request fails, ``None`` for
    non-OK responses or non-HTML/XML content, a title string (truncated
    to 200 characters) when a title is found, and a placeholder string
    when the page has no ``<title>`` tag.
    """
    url = m.group()

    try:
        # NOTE(review): certificate verification is deliberately disabled so
        # internal sites with self-signed certs still resolve — a conscious
        # security trade-off, not an oversight.
        # The timeout keeps a slow or unresponsive host from hanging the
        # caller; requests.Timeout subclasses RequestException, so it is
        # handled by the except clause below.
        r = requests.get(url, verify=False, timeout=10)
    except RequestException as e:
        # Usually someone just posted a malformed URL to IRC, so report the
        # failure instead of letting the exception bubble up.
        return "nope, didn't get it (%s)" % str(e)

    if r.status_code != requests.codes.ok:
        return

    # Servers may omit Content-type entirely; .get() avoids a KeyError.
    # startswith() accepts a tuple, so one call covers both prefixes.
    if not r.headers.get('Content-type', '').startswith(
            ('text', 'application/xml')):
        return

    # BeautifulSoup's objection to being passed something like a JPG as a
    # unicode string is to raise a UnicodeEncodeError; the cheap "<" sniff
    # below sidesteps most of those cases.
    # (test-case: "http://jacovanstaden.files.wordpress.com/2011/03/git-flow-overview.jpg")
    try:
        if r.text[:1] != '<':
            return
        page = BeautifulSoup(r.text)
    except HTMLParser.HTMLParseError:
        return "Could not parse %s with BeautifulSoup. Shun the author." % url
    except TypeError:
        # r.text can be non-string-like when a site sends a malformed
        # Content-type header; rare, but possible.
        return

    result = page.find("title")
    if result is not None:
        title = util.strip_html(result).decode("utf-8")
        # Keep output readable: cap at 200 chars, ellipsis included.
        if len(title) > 200:
            title = "%s..." % title[:197]
        return title

    return "Untitled (no <title> tag found)"
Example #4
0
def parse_html(data):
    """Return the stripped text of the page's ``a.h1a`` heading link."""
    document = soup(data)
    heading_link = document.find("a", {"class": "h1a"})
    return util.strip_html(heading_link)
Example #5
0
def parse_html(data):
    """Extract the plain text of the ``a.h1a`` element from *data*."""
    anchor = soup(data).find("a", {"class": "h1a"})
    return util.strip_html(anchor)