Example #1
def remove_nodes_via_regex(doc, pattern):
    # drop every element whose id or class matches the given pattern
    # (case-insensitive), using lxml's EXSLT re:test() extension
    for selector in ['id', 'class']:
        reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
        naughty_list = doc.xpath(reg, namespaces={'re': REGEX_NS})
        for node in naughty_list:
            parser.remove(node)
    return doc
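All of these snippets lean on a module-level `parser` object and a `REGEX_NS` constant that the excerpts never define. A minimal sketch of what they might look like on top of lxml; the class and method bodies below are assumptions for illustration, not the extractor's actual implementation (the `re:test()` call in the XPath above is lxml's EXSLT regular-expressions extension, registered under this namespace):

REGEX_NS = "http://exslt.org/regular-expressions"

class Parser(object):
    # hypothetical lxml-backed facade covering the calls used below;
    # the remaining helpers (get_tag, sibling walkers, attribute
    # accessors) are assumed to wrap the matching lxml APIs
    def remove(self, node):
        parent = node.getparent()
        if parent is not None:
            parent.remove(node)

    def get_elements_by_tag(self, node, tag=None):
        return node.xpath('descendant-or-self::%s' % (tag or '*'))

    def get_elements_by_tags(self, node, tags):
        return node.xpath(' | '.join('descendant-or-self::%s' % t
                                     for t in tags))

    def get_text(self, node):
        return ''.join(node.itertext()).strip()

    def get_comments(self, node):
        return node.xpath('//comment()')

parser = Parser()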
Example #2
def get_replacement_nodes(doc, div):
    """
    Walk the children of a div, folding bare text nodes and the <a>
    tags adjacent to them into synthetic paragraph nodes.
    """
    replacement_text = []
    nodes_to_return = []
    nodes_to_remove = []
    kids = parser.child_nodes_with_text(div)
    for kid in kids:
        # the node is a <p> and we already have buffered replacement
        # text, so flush the buffer into a new node first
        if parser.get_tag(kid) == 'p' and len(replacement_text) > 0:
            new_node = get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
            nodes_to_return.append(kid)
        # node is a text node
        elif parser.is_text_node(kid):
            kid_text_node = kid
            kid_text = parser.get_text(kid)
            replace_text = kid_text
            for p, w in TABS_AND_NEWLINES:
                replace_text = replace_text.replace(p, w)
            if len(replace_text) > 1:
                # fold any preceding <a> siblings into the buffer
                prev_sib_node = parser.previous_sibling(kid_text_node)
                while prev_sib_node is not None \
                    and parser.get_tag(prev_sib_node) == "a" \
                    and parser.get_attribute(prev_sib_node, 'usedalready') != 'yes':
                    outer = " " + parser.outer_html(prev_sib_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(prev_sib_node)
                    parser.set_attribute(prev_sib_node,
                                         attr='usedalready',
                                         value='yes')
                    prev_sib_node = parser.previous_sibling(prev_sib_node)
                # append the cleaned text itself
                replacement_text.append(replace_text)
                # then fold any following <a> siblings the same way
                next_sib_node = parser.next_sibling(kid_text_node)
                while next_sib_node is not None \
                    and parser.get_tag(next_sib_node) == "a" \
                    and parser.get_attribute(next_sib_node, 'usedalready') != 'yes':
                    outer = " " + parser.outer_html(next_sib_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(next_sib_node)
                    parser.set_attribute(next_sib_node,
                                         attr='usedalready',
                                         value='yes')
                    next_sib_node = parser.next_sibling(next_sib_node)
        # otherwise
        else:
            nodes_to_return.append(kid)
    # flush out anything still remaining
    if len(replacement_text) > 0:
        new_node = get_flushed_buffer(''.join(replacement_text), doc)
        nodes_to_return.append(new_node)
        replacement_text = []
    for n in nodes_to_remove:
        parser.remove(n)
    return nodes_to_return
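Two helpers above are not shown: `TABS_AND_NEWLINES`, which the loop consumes as (old, new) pairs for `str.replace`, and `get_flushed_buffer`, which presumably turns the buffered text (including the outer HTML of folded <a> tags) back into a paragraph element. A hedged sketch of both, assuming lxml:

import lxml.html

# assumed: literal (old, new) replacement pairs
TABS_AND_NEWLINES = [("\n", " "), ("\t", " ")]

def get_flushed_buffer(replacement_text, doc):
    # hypothetical helper: wrap the accumulated fragment in a fresh <p>;
    # doc is accepted only to match the call sites above
    return lxml.html.fragment_fromstring("<p>" + replacement_text + "</p>")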
Example #3
def is_table_tag_and_no_paragraphs_exist(e):
    # drop trivially short paragraphs, then flag the element if no real
    # paragraphs remain (and it is not itself a lone table cell)
    sub_paragraphs = parser.get_elements_by_tag(e, tag='p')
    for p in sub_paragraphs:
        txt = parser.get_text(p)
        if len(txt) < 25:
            parser.remove(p)
    sub_paragraphs2 = parser.get_elements_by_tag(e, tag='p')
    if len(sub_paragraphs2) == 0 and e.tag != "td":
        return True
    return False
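For illustration, with the facade sketched under Example #1: a table whose only paragraph is shorter than 25 characters gets flagged, because the short <p> is removed first and none remain:

import lxml.html

table = lxml.html.fragment_fromstring(
    "<table><tr><td><p>short caption</p></td></tr></table>")
print(is_table_tag_and_no_paragraphs_exist(table))  # True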
Example #4
def cleanup(target_node):
    """
    Remove any divs that looks like non-content,
    Clusters of links, or paras with no gusto.
    """
    node = add_siblings(target_node)
    for e in node.getchildren():
        if e.tag != 'p':
            if is_high_link_density(e) \
                or is_table_tag_and_no_paragraphs_exist(e) \
                or not node_score_threshold_met(node, e):
                parser.remove(e)
    return node
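`is_high_link_density` and `node_score_threshold_met` are defined elsewhere. For the former, a common heuristic (and roughly what article extractors in the python-goose family use) scores an element by the share of its words that sit inside links, weighted by the link count; a sketch under that assumption:

def is_high_link_density(e):
    # flag elements whose text is dominated by link text
    links = parser.get_elements_by_tag(e, tag='a')
    if not links:
        return False
    words = parser.get_text(e).split()
    if not words:
        return True  # links but no visible text: treat as a link cluster
    link_words = ' '.join(parser.get_text(a) for a in links).split()
    score = float(len(links) * len(link_words)) / len(words)
    return score >= 1.0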
Example #5
def remove_paragraphs_with_few_words(top_node):
    """
    Remove paragraphs that have less than x number of words,  would
    indicate that it's some sort of link.
    """
    all_nodes = parser.get_elements_by_tags(top_node, ['*'])
    all_nodes.reverse()
    for el in all_nodes:
        text = parser.get_text(el)
        # get_stop_word_count(text) returns a stats object; calling it
        # again on the result yields the actual count
        word_stats = StopWords().get_stop_word_count(text)
        if word_stats.get_stop_word_count() < 3 \
            and len(parser.get_elements_by_tag(el, tag='object')) == 0 \
            and len(parser.get_elements_by_tag(el, tag='embed')) == 0:
            parser.remove(el)
        else:
            # TODO: check whether this parenthetical cleanup belongs here
            trimmed = parser.get_text(el)
            if trimmed.startswith("(") and trimmed.endswith(")"):
                parser.remove(el)
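The `StopWords` call chain reads oddly but implies a two-step API: `get_stop_word_count(text)` returns a stats object whose argument-less method of the same name yields the number. A minimal sketch of that assumed shape, with a placeholder word list:

COMMON_STOP_WORDS = {"a", "an", "the", "and", "or", "of", "to",
                     "in", "is", "it", "that", "for", "on", "with"}

class WordStats(object):
    def __init__(self, count):
        self._count = count

    def get_stop_word_count(self):
        return self._count

class StopWords(object):
    def get_stop_word_count(self, text):
        # real implementations load a per-language stop-word list
        words = text.lower().split()
        return WordStats(sum(1 for w in words if w in COMMON_STOP_WORDS))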
Example #6
def remove_script_and_style(doc):
    # remove scripts
    scripts = parser.get_elements_by_tag(doc, tag='script')
    for item in scripts:
        parser.remove(item)
    # remove styles
    styles = parser.get_elements_by_tag(doc, tag='style')
    for item in styles:
        parser.remove(item)
    # remove comments
    comments = parser.get_comments(doc)
    for item in comments:
        parser.remove(item)
    return doc
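A quick usage sketch with the facade from Example #1, which locates comments through an XPath comment() query:

import lxml.html

doc = lxml.html.fromstring(
    "<html><head><style>p {}</style></head>"
    "<body><!-- ad slot --><script>track()</script>"
    "<p>Body text.</p></body></html>")
doc = remove_script_and_style(doc)
print(lxml.html.tostring(doc))  # only the <p> content survives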
Example #7
def clean_bad_tags(doc):
    # ids
    naughty_list = doc.xpath(QUERY_IDS, namespaces={'re': REGEX_NS})
    for node in naughty_list:
        parser.remove(node)
    # classes
    naughty_classes = doc.xpath(QUERY_CLASSES,
                                namespaces={'re': REGEX_NS})
    for node in naughty_classes:
        parser.remove(node)
    # names
    naughty_names = doc.xpath(QUERY_NAMES,
                              namespaces={'re': REGEX_NS})
    for node in naughty_names:
        parser.remove(node)
    return doc
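`QUERY_IDS`, `QUERY_CLASSES`, and `QUERY_NAMES` are presumably EXSLT regex queries in the same shape as Example #1, built from a blacklist of non-content markers. A hedged sketch; the word list here is illustrative, not the project's real one:

NAUGHTY_RE = "comment|community|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor"
QUERY_IDS = "//*[re:test(@id, '%s', 'i')]" % NAUGHTY_RE
QUERY_CLASSES = "//*[re:test(@class, '%s', 'i')]" % NAUGHTY_RE
QUERY_NAMES = "//*[re:test(@name, '%s', 'i')]" % NAUGHTY_RE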