def PLAIN(key, r): print "PLAIN", r, key if "content" in r.raw[key]: if key is not "stdin": charset = cp(r.raw[key]["content"]) if charset["encoding"] is "MacCyrillic": charset["encoding"] = "windows-1251" r.raw[key]["content"] = r.raw[key]["content"].decode(charset["encoding"]) r.raw[key]["plain"] = clean(r.raw[key]["content"], r.query["stop"]) del r.raw[key]["content"]
def HTML(key, r): print "HTML", r, key if "content" not in r.raw[key]: r.raw[key]["error"] = u"отсутствует контент" return try: charset = cp(r.raw[key]["content"]) if charset["encoding"] is "MacCyrillic": charset["encoding"] = "windows-1251" ucontent = r.raw[key]["content"].decode(charset["encoding"]) html = fromstring( ucontent.replace("</html>", "").replace("</HTML>", "").replace('encoding="' + charset["encoding"] + '"', "") + "</html>" ) r.raw[key]["content"] = ucontent except ValueError: html = fromstring(r.raw[key]["content"].replace("</html>", "").replace("</HTML>", "") + "</html>") except: r.raw[key]["error"] = u"ошибка декодирования html" return try: for tg in ["link", "noindex", "script", "style"]: map(lambda x: x.drop_tree(), html.xpath("//" + tg)) r.raw[key]["body"] = clean(html.body.text_content(), r.query["stop"]) r.raw[key]["plain"] = clean(html.text_content(), r.query["stop"]) for tg in ["title", ["h1", "h2", "h3", "h4", "h5", "h6"], ["b", "strong"]]: if type(tg) is type(""): title = html.xpath("//" + tg) tag = tg else: title = html.xpath("|".join(map(lambda x: "//" + x, tg))) tag = tg[0] + "-" + tg[len(tg) - 1] r.raw[key][tag] = ( " ".join(map(lambda x: clean(x.text_content(), r.query["stop"]), title)) if title else False ) if r.raw[key][tag] is False: del r.raw[key][tag] except: r.raw[key]["plain"] = clean(html.text_content(), r.query["stop"]) del r.raw[key]["content"]