Example #1
import copy

def sibling_content(current_sibling, sibling_paragraphs_baseline_score):
    """
    Returns any sibling paragraphs that score well enough to be added
    to this node.
    """
    if current_sibling.tag == 'p' \
            and len(parser.get_text(current_sibling)) > 0:
        e0 = current_sibling
        if e0.tail:
            # copy the node so clearing the tail can't mutate the source tree
            e0 = copy.deepcopy(e0)
            e0.tail = ''
        return [e0]
    else:
        potential_paragraphs = parser.get_elements_by_tag(current_sibling, tag='p')
        if potential_paragraphs is None:
            return None
        else:
            ps = []
            for paragraph in potential_paragraphs:
                text = parser.get_text(paragraph)
                if len(text) > 0:
                    word_stats = StopWords().get_stop_word_count(text)
                    paragraph_score = word_stats.get_stop_word_count()
                    sibling_baseline_score = 0.30
                    high_link_density = is_high_link_density(paragraph)
                    score = float(sibling_paragraphs_baseline_score * sibling_baseline_score)
                    if score < paragraph_score and not high_link_density:
                        p = parser.create_element(tag='p', text=text, tail=None)
                        ps.append(p)
            return ps
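
All of these examples lean on two helpers that are never shown: the `parser` wrapper and the `StopWords` counter. A minimal sketch of both, assuming an lxml backend; the method names match the calls in the snippets, but the bodies are stand-ins, not the library's real implementations:

import lxml.html

class Parser(object):
    # Hypothetical stand-in for the library's lxml-backed wrapper;
    # only the calls made in these examples are sketched.
    @staticmethod
    def get_text(node):
        return node.text_content().strip()

    @staticmethod
    def get_elements_by_tag(node, tag=None):
        return node.findall('.//' + tag)

    @staticmethod
    def create_element(tag='p', text=None, tail=None):
        el = lxml.html.Element(tag)
        el.text = text
        el.tail = tail
        return el

    @staticmethod
    def remove(node):
        # simplified: the real wrapper also preserves the node's tail text
        parent = node.getparent()
        if parent is not None:
            parent.remove(node)

parser = Parser()

# The stop-word counter returns an object exposing get_stop_word_count();
# the real class loads per-language stop-word lists, but a tiny English
# subset is enough for a sketch.
COMMON_STOPWORDS = frozenset(
    'a an and are as at be by for from in is it of on or that the to with'.split())

class WordStats(object):
    def __init__(self, count):
        self.count = count

    def get_stop_word_count(self):
        return self.count

class StopWords(object):
    def get_stop_word_count(self, text):
        words = text.lower().split()
        return WordStats(sum(1 for w in words if w in COMMON_STOPWORDS))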
Example #2
def calculate_best_node_based_on_clustering(doc):
    top_node = None
    check_nodes = nodes_to_check(doc)
    starting_boost = float(1.0)
    cnt = 0
    i = 0
    parent_nodes = set()
    nodes_with_text = []
    for node in check_nodes:
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        high_link_density = is_high_link_density(node)
        if word_stats.get_stop_word_count() > 2 and not high_link_density:
            nodes_with_text.append(node)
    number_of_nodes = len(nodes_with_text)
    negative_scoring = 0
    bottom_nodes_for_negative_score = float(number_of_nodes) * 0.25
    for node in nodes_with_text:
        boost_score = float(0)
        # boost
        if ok_to_boost(node):
            if cnt >= 0:
                boost_score = float((1.0 / starting_boost) * 50)
                starting_boost += 1
        # number_of_nodes
        if number_of_nodes > 15:
            if (number_of_nodes - i) <= bottom_nodes_for_negative_score:
                booster = float(bottom_nodes_for_negative_score -
                                (number_of_nodes - i))
                boost_score = float(-pow(booster, float(2)))
                negscore = abs(boost_score) + negative_scoring
                if negscore > 40:
                    boost_score = float(5)
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        upscore = int(word_stats.get_stop_word_count() + boost_score)
        # parent node
        parent_node = parser.get_parent(node)
        update_score(parent_node, upscore)
        update_node_count(parent_node, 1)
        if parent_node not in parent_nodes:
            parent_nodes.add(parent_node)
        # parentparent node
        parent_parent_node = parser.get_parent(parent_node)
        if parent_parent_node is not None:
            update_node_count(parent_parent_node, 1)
            update_score(parent_parent_node, upscore / 2)
            if parent_parent_node not in parent_nodes:
                parent_nodes.add(parent_parent_node)
        cnt += 1
        i += 1
    top_node_score = 0
    for e in parent_nodes:
        score = get_score(e)
        if score > top_node_score:
            top_node = e
            top_node_score = score
        if top_node is None:
            top_node = e
    return top_node
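
The score bookkeeping used here (`update_score`, `update_node_count`, `get_score`) is also not shown. One plausible implementation keeps a running score in attributes on the element itself (python-goose uses a `gravityScore` attribute for this); a hedged sketch:

# Sketch only: the attribute names are an assumption borrowed from
# python-goose, not guaranteed by the snippets above.
def update_score(node, add_to_score):
    current_score = int(node.get('gravityScore') or 0)
    node.set('gravityScore', str(current_score + add_to_score))

def update_node_count(node, add_to_count):
    current_count = int(node.get('gravityNodes') or 0)
    node.set('gravityNodes', str(current_count + add_to_count))

def get_score(node):
    return int(node.get('gravityScore') or 0)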
Example #3
def ok_to_boost(node):
    """\
    A lot of times the first paragraph might be the caption under an image
    so we'll want to make sure if we're going to boost a parent node that
    it should be connected to other paragraphs,
    at least for the first n paragraphs so we'll want to make sure that
    the next sibling is a paragraph and has at
    least some substatial weight to it
    """
    para = "p"
    steps_away = 0
    minimum_stop_word_count = 5
    max_steps_away_from_node = 3
    nodes = walk_siblings(node)
    for current_node in nodes:
        # only paragraph siblings count
        if current_node.tag == para:
            if steps_away >= max_steps_away_from_node:
                return False
            para_text = parser.get_text(current_node)
            word_stats = StopWords().get_stop_word_count(para_text)
            if word_stats.get_stop_word_count() > minimum_stop_word_count:
                return True
            steps_away += 1
    return False
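
`walk_siblings` is assumed but not shown; a minimal sketch that collects the node's preceding siblings, nearest first, using lxml's `getprevious()`:

def walk_siblings(node):
    # gather preceding siblings, closest to `node` first
    siblings = []
    current_sibling = node.getprevious()
    while current_sibling is not None:
        siblings.append(current_sibling)
        current_sibling = current_sibling.getprevious()
    return siblings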
Example #4
def title(doc):
    """Return the document's title."""
    title = ''
    title_elem = parser.get_elements_by_tag(doc, tag='title')
    # no title found
    if title_elem is None or len(title_elem) == 0:
        return title
    # title elem found
    title_text = parser.get_text(title_elem[0])
    used_delimiter = False
    # split title with |
    if '|' in title_text:
        title_text = _split_title(title_text, "\\|")
        used_delimiter = True
    # split title with -
    if not used_delimiter and '-' in title_text:
        title_text = _split_title(title_text, " - ")
        used_delimiter = True
    # split title with »
    if not used_delimiter and u'»' in title_text:
        title_text = _split_title(title_text, u"»")
        used_delimiter = True
    # split title with :
    if not used_delimiter and ':' in title_text:
        title_text = _split_title(title_text, ":")
        used_delimiter = True
    title = title_text.replace("&#65533;", "")
    return title
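
`_split_title` is not shown either. The escaped `"\\|"` suggests the delimiter is treated as a regular expression, and a common heuristic is to keep the longest piece on the assumption that it is the headline rather than the site name. A sketch under those assumptions:

import re

def _split_title(title_text, delimiter_pattern):
    # e.g. "Story headline | Example News" -> "Story headline"
    pieces = re.split(delimiter_pattern, title_text)
    return max(pieces, key=len).strip()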
Example #5
File: cleaner.py  Project: kolanos/gander
def get_replacement_nodes(doc, div):
    replacement_text = []
    nodes_to_return = []
    nodes_to_remove = []
    children = parser.child_nodes_with_text(div)
    for kid in children:
        # node is a p
        # and already have some replacement text
        if parser.get_tag(kid) == 'p' and len(replacement_text) > 0:
            new_node = get_flushed_buffer(''.join(replacement_text),
                                               doc)
            nodes_to_return.append(new_node)
            replacement_text = []
            nodes_to_return.append(kid)
        # node is a text node
        elif parser.is_text_node(kid):
            kid_text_node = kid
            kid_text = parser.get_text(kid)
            replace_text = kid_text
            for p, w in TABS_AND_NEWLINES:
                replace_text = replace_text.replace(p, w)
            if len(replace_text) > 1:
                prev_sib_node = parser.previous_sibling(kid_text_node)
                while prev_sib_node is not None \
                    and parser.get_tag(prev_sib_node) == "a" \
                    and parser.get_attribute(prev_sib_node, 'usedalready') != 'yes':
                    outer = " " + parser.outer_html(prev_sib_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(prev_sib_node)
                    parser.set_attribute(prev_sib_node,
                                         attr='usedalready',
                                         value='yes')
                    prev_sib_node = parser.previous_sibling(prev_sib_node)
                # append replace_text
                replacement_text.append(replace_text)
                # then absorb trailing <a> siblings the same way
                next_sib_node = parser.next_sibling(kid_text_node)
                while next_sib_node is not None \
                    and parser.get_tag(next_sib_node) == "a" \
                    and parser.get_attribute(next_sib_node, 'usedalready') != 'yes':
                    outer = " " + parser.outer_html(next_sib_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(next_sib_node)
                    parser.set_attribute(next_sib_node,
                                         attr='usedalready',
                                         value='yes')
                    # advance, or this loop never terminates
                    next_sib_node = parser.next_sibling(next_sib_node)
        # otherwise
        else:
            nodes_to_return.append(kid)
    # flush out anything still remaining
    if len(replacement_text) > 0:
        new_node = get_flushed_buffer(''.join(replacement_text), doc)
        nodes_to_return.append(new_node)
        replacement_text = []
    for n in nodes_to_remove:
        parser.remove(n)
    return nodes_to_return
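
`get_flushed_buffer` turns the accumulated buffer (plain text plus the serialized `<a>` tags) back into a node. A minimal sketch, assuming the buffer is valid HTML fragment content:

import lxml.html

def get_flushed_buffer(replacement_text, doc):
    # `doc` is kept for signature compatibility but unused in this sketch
    return lxml.html.fragment_fromstring('<p>' + replacement_text + '</p>')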
Example #6
from HTMLParser import HTMLParser  # Python 2; on Python 3 use html.unescape

def convert_to_text(top_node):
    txts = []
    for node in list(top_node):
        txt = parser.get_text(node)
        if txt:
            txt = HTMLParser().unescape(txt)
            txts.append(inner_trim(txt))
    return '\n\n'.join(txts)
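
A usage sketch, assuming the Parser stand-in from Example #1 and an `inner_trim` that collapses runs of whitespace (hypothetical stand-in shown inline):

import lxml.html

def inner_trim(text):
    return ' '.join(text.split())

top = lxml.html.fragment_fromstring(
    '<div><p>First &amp; second.</p><p>Third.</p></div>')
print(convert_to_text(top))
# First & second.
#
# Third.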
Example #7
def is_table_tag_and_no_paragraphs_exist(e):
    sub_paragraphs = parser.get_elements_by_tag(e, tag='p')
    for p in sub_paragraphs:
        txt = parser.get_text(p)
        if len(txt) < 25:
            parser.remove(p)
    sub_paragraphs2 = parser.get_elements_by_tag(e, tag='p')
    if len(sub_paragraphs2) == 0 and e.tag != "td":
        return True
    return False
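
A quick check of the predicate, again using the Parser stand-in from Example #1: a layout table whose cells hold only short paragraphs ends up with no real paragraphs at all and is flagged.

import lxml.html

table = lxml.html.fragment_fromstring(
    '<table><tr><td><p>Home</p></td><td><p>About us</p></td></tr></table>')
print(is_table_tag_and_no_paragraphs_exist(table))   # True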
Example #8
def remove_paragraphs_with_few_words(top_node):
    """
    Remove paragraphs that have less than x number of words,  would
    indicate that it's some sort of link.
    """
    all_nodes = parser.get_elements_by_tags(top_node, ['*'])
    all_nodes.reverse()
    for el in all_nodes:
        text = parser.get_text(el)
        stop_words = StopWords().get_stop_word_count(text)
        if stop_words.get_stop_word_count() < 3 \
            and len(parser.get_elements_by_tag(el, tag='object')) == 0 \
            and len(parser.get_elements_by_tag(el, tag='embed')) == 0:
            parser.remove(el)
        # TODO: Check if it is in the right place.
        else:
            trimmed = parser.get_text(el)
            if trimmed.startswith("(") and trimmed.endswith(")"):
                parser.remove(el)
Example #9
def is_high_link_density(e):
    """
    Checks the density of links within a node, is there not much text and
    most of it contains linky shit? if so it's no good.
    """
    links = parser.get_elements_by_tag(e, tag='a')
    if links is None or len(links) == 0:
        return False
    text = parser.get_text(e)
    words = text.split(' ')
    number_of_words = float(len(words))
    sb = []
    for link in links:
        sb.append(parser.get_text(link))
    link_text = ''.join(sb)
    link_words = link_text.split(' ')
    number_of_link_words = float(len(link_words))
    number_of_links = float(len(links))
    link_divisor = float(number_of_link_words / number_of_words)
    score = float(link_divisor * number_of_links)
    if score >= 1.0:
        return True
    return False
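
A worked check of the formula with the Parser stand-in from Example #1: the score is `(link_words / total_words) * number_of_links`, and anything at or above 1.0 is flagged.

import lxml.html

# 2 link words / 3 words * 1 link = 0.67 -> keeps the node
node = lxml.html.fragment_fromstring(
    '<div>read <a href="#">more here</a></div>')
print(is_high_link_density(node))   # False

# 6 link words / 6 words * 1 link = 1.0 -> flagged as link-heavy
node = lxml.html.fragment_fromstring(
    '<div><a href="#">all of this is a link</a></div>')
print(is_high_link_density(node))   # True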
Example #10
def tags(node):
    """Return anchors with rel="tag" attribute."""
    # Node doesn't have children
    if len(list(node)) == 0:
        return set()
    # Alternate selector: "a[rel=tag], a[href*=/tag/]"
    elements = node.cssselect("a[rel=tag]")
    if elements is None:
        return set()
    tags = []
    for el in elements:
        tag = parser.get_text(el)
        if tag:
            tags.append(tag)
    return set(tags)
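
Usage sketch (lxml's `.cssselect()` requires the separate `cssselect` package):

import lxml.html

node = lxml.html.fragment_fromstring(
    '<div><a rel="tag" href="/tag/python/">python</a> '
    '<a href="/about/">about</a></div>')
print(tags(node))   # set(['python'])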
Example #11
def siblings_baseline_score(top_node):
    """
    We could have long articles that have tons of paragraphs
    so if we tried to calculate the base score against
    the total text score of those paragraphs it would be unfair.
    So we need to normalize the score based on the average scoring
    of the paragraphs within the top node.
    For example if our total score of 10 paragraphs was 1000
    but each had an average value of 100 then 100 should be our base.
    """
    base = 100000
    number_of_paragraphs = 0
    score_of_paragraphs = 0
    nodes_to_check = parser.get_elements_by_tag(top_node, tag='p')
    for node in nodes_to_check:
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        high_link_density = is_high_link_density(node)
        if word_stats.get_stop_word_count() > 2 and not high_link_density:
            number_of_paragraphs += 1
            score_of_paragraphs += word_stats.get_stop_word_count()
    if number_of_paragraphs > 0:
        base = score_of_paragraphs / number_of_paragraphs
    return base
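
Putting Examples #1, #3, and #11 together: the baseline computed here feeds `sibling_content`, whose `0.30 * baseline` threshold decides which sibling paragraphs get prepended to the top node. A hedged sketch of that driver (python-goose calls it `add_siblings`; treat the exact wiring as an assumption):

def add_siblings(top_node):
    baseline = siblings_baseline_score(top_node)
    # walk preceding siblings (see the walk_siblings sketch in Example #3)
    for sibling in walk_siblings(top_node):
        ps = sibling_content(sibling, baseline)
        if ps:
            for p in ps:
                top_node.insert(0, p)
    return top_node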