示例#1
0
def PLAIN(key, r):
    """Replace the raw content of ``r.raw[key]`` with cleaned plain text.

    When the entry holds a ``"content"`` value it is decoded (for every key
    except ``"stdin"``) with the encoding reported by ``cp``, passed through
    ``clean`` together with ``r.query["stop"]`` and stored under ``"plain"``;
    the ``"content"`` value is then removed. Entries without ``"content"``
    are left untouched.

    Args:
        key: Key of the entry in ``r.raw`` to process.
        r: Object exposing ``raw`` (mapping of key -> entry dict) and
            ``query`` (mapping with a ``"stop"`` item).

    Returns:
        None. ``r.raw[key]`` is modified in place.
    """
    # Debug trace; the single-argument form prints the same text whether
    # ``print`` is a statement or a function.
    print("PLAIN %s %s" % (r, key))

    entry = r.raw[key]
    if "content" not in entry:
        return

    # Compare by value: identity checks against string literals only work
    # when both strings happen to be interned.
    if key != "stdin":
        # NOTE(review): "stdin" content is used as-is, presumably because it
        # is already decoded text -- confirm against the caller.
        charset = cp(entry["content"])
        encoding = charset["encoding"]
        if encoding == "MacCyrillic":
            # A "MacCyrillic" detection result is decoded as windows-1251.
            # NOTE(review): presumably a common misdetection -- confirm.
            encoding = "windows-1251"
        entry["content"] = entry["content"].decode(encoding)

    entry["plain"] = clean(entry["content"], r.query["stop"])
    del entry["content"]
示例#2
0
def HTML(key, r):
    """Parse the HTML content of ``r.raw[key]`` and store cleaned text fields.

    On success the entry receives:
      * ``"body"``     -- cleaned text of the document body,
      * ``"plain"``    -- cleaned text of the whole document,
      * ``"title"``, ``"h1-h6"``, ``"b-strong"`` -- space-joined cleaned text
        of the matching elements (a field is removed when nothing matches),
    and its ``"content"`` value is removed. ``link``, ``noindex``, ``script``
    and ``style`` elements are dropped before any text is extracted.

    On failure the entry receives an ``"error"`` message instead:
      * content is missing  -> u"отсутствует контент",
      * content cannot be decoded/parsed -> u"ошибка декодирования html"
        (``"content"`` is kept in this case).

    Args:
        key: Key of the entry in ``r.raw`` to process.
        r: Object exposing ``raw`` (mapping of key -> entry dict) and
            ``query`` (mapping with a ``"stop"`` item passed to ``clean``).

    Returns:
        None. ``r.raw[key]`` is modified in place.
    """
    # Debug trace; the single-argument form prints the same text whether
    # ``print`` is a statement or a function.
    print("HTML %s %s" % (r, key))

    entry = r.raw[key]
    if "content" not in entry:
        entry["error"] = u"отсутствует контент"
        return

    raw_content = entry["content"]
    try:
        try:
            charset = cp(raw_content)
            encoding = charset["encoding"]
            # Compare by value; ``is`` against a literal depends on interning.
            if encoding == "MacCyrillic":
                # A "MacCyrillic" detection result is decoded as windows-1251.
                # NOTE(review): presumably a common misdetection -- confirm.
                encoding = "windows-1251"
            ucontent = raw_content.decode(encoding)
            # Strip any closing tag and the encoding declaration, then
            # re-append a single closing tag before parsing.
            html = fromstring(
                ucontent.replace("</html>", "").replace("</HTML>", "").replace('encoding="' + encoding + '"', "")
                + "</html>"
            )
            entry["content"] = ucontent
        except ValueError:
            # ValueError also covers UnicodeDecodeError from decode(); fall
            # back to parsing the undecoded content.
            html = fromstring(raw_content.replace("</html>", "").replace("</HTML>", "") + "</html>")
    except Exception:
        # Includes a failure of the fallback parse above, which would
        # otherwise escape the function instead of being recorded.
        entry["error"] = u"ошибка декодирования html"
        return

    try:
        # Explicit loops: these calls are made for their side effect and
        # must run regardless of whether ``map`` is eager or lazy.
        for tag_name in ("link", "noindex", "script", "style"):
            for node in html.xpath("//" + tag_name):
                node.drop_tree()

        entry["body"] = clean(html.body.text_content(), r.query["stop"])
        entry["plain"] = clean(html.text_content(), r.query["stop"])

        for tags in (("title",), ("h1", "h2", "h3", "h4", "h5", "h6"), ("b", "strong")):
            # Field name is the tag itself, or "first-last" for a group.
            field = tags[0] if len(tags) == 1 else tags[0] + "-" + tags[-1]
            nodes = html.xpath("|".join("//" + tag_name for tag_name in tags))
            if nodes:
                entry[field] = " ".join(clean(node.text_content(), r.query["stop"]) for node in nodes)
            else:
                # No match: make sure the field is absent from the entry.
                entry.pop(field, None)
    except Exception:
        # Best effort: if structured extraction fails, keep at least the
        # plain text of the whole document.
        entry["plain"] = clean(html.text_content(), r.query["stop"])
    del entry["content"]