def extract(source, filters):
    """Pull values out of an HTML document using CSS-selector filters.

    Args:
        source: Location of the HTML. When ``is_url(source)`` is true the
            markup is taken from ``fetch_url(source)[1]``; otherwise
            ``source`` is opened as a local file path and read.
        filters: Iterable of ``"key=selector"`` strings. Everything after the
            first ``=`` is the CSS selector. A key of the form ``"name.attr"``
            collects the ``attr`` attribute of each matched element instead
            of the element content.

    Returns:
        A dict with two entries per filter:

        * ``name`` -- for plain keys, ``doc_to_text`` of every matched
          element; for ``name.attr`` keys, the attribute values (``""`` when
          an element lacks the attribute).
        * ``"_" + name`` -- for plain keys, ``doc_to_str`` of every matched
          element; for ``name.attr`` keys, the very same list of attribute
          values as stored under ``name``.

    Raises:
        ValueError: If a filter contains no ``=``, or if a key contains more
            than one ``.``.
    """
    # Split on the first "=" only: selectors such as a[rel=next] contain "="
    # themselves and must survive intact.
    selectors = dict(spec.split("=", 1) for spec in filters)

    if is_url(source):
        # Only the second item of fetch_url's result is used as the markup.
        markup = fetch_url(source)[1]
    else:
        # Context manager so the file handle is closed deterministically.
        with open(source, "r") as handle:
            markup = handle.read()

    doc = parse_html(markup)
    result = {}
    for key, selector in selectors.items():
        elements = doc.cssselect(selector)
        if "." in key:
            # "name.attr": report the attribute value for both views.
            key, attr_name = key.split(".")
            texts = htmls = [el.attrib.get(attr_name, "") for el in elements]
        else:
            htmls = [doc_to_str(el) for el in elements]
            texts = [doc_to_text(el) for el in elements]
        result["_{0:s}".format(key)] = htmls
        result[key] = texts
    return result
def doc_to_text(doc):
    """Convert *doc* to text by chaining the module's conversion helpers.

    The steps run in this order: ``doc_to_str`` -> ``unescape`` ->
    ``html_to_text`` -> ``unichar_to_text``; the result of the last call is
    returned.
    """
    serialized = doc_to_str(doc)
    unescaped = unescape(serialized)
    plain = html_to_text(unescaped)
    return unichar_to_text(plain)