def calculate_best_node_based_on_clustering(doc):
    top_node = None
    check_nodes = nodes_to_check(doc)
    starting_boost = 1.0
    cnt = 0
    i = 0
    parent_nodes = set()
    nodes_with_text = []

    # First pass: keep only nodes that carry real text, i.e. enough
    # stop words and not dominated by links.
    for node in check_nodes:
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        high_link_density = is_high_link_density(node)
        if word_stats.get_stop_word_count() > 2 and not high_link_density:
            nodes_with_text.append(node)

    number_of_nodes = len(nodes_with_text)
    negative_scoring = 0
    bottom_nodes_for_negative_score = number_of_nodes * 0.25

    # Second pass: score each text node and push the score up to its
    # parent and grandparent so the densest cluster rises to the top.
    for node in nodes_with_text:
        boost_score = 0.0

        # boost: nodes that look like article content get a bonus that
        # decays as more boosted nodes are seen.
        if ok_to_boost(node):
            if cnt >= 0:
                boost_score = (1.0 / starting_boost) * 50
                starting_boost += 1

        # number_of_nodes: in long documents, penalize the bottom 25% of
        # nodes quadratically; they are usually comments and footers.
        if number_of_nodes > 15:
            if (number_of_nodes - i) <= bottom_nodes_for_negative_score:
                booster = float(bottom_nodes_for_negative_score - (number_of_nodes - i))
                boost_score = -pow(booster, 2)
                negscore = abs(boost_score) + negative_scoring
                if negscore > 40:
                    boost_score = 5.0

        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        upscore = int(word_stats.get_stop_word_count() + boost_score)

        # parent node
        parent_node = parser.get_parent(node)
        update_score(parent_node, upscore)
        update_node_count(parent_node, 1)
        if parent_node not in parent_nodes:
            parent_nodes.add(parent_node)

        # parentparent node: half weight one level further up
        parent_parent_node = parser.get_parent(parent_node)
        if parent_parent_node is not None:
            update_node_count(parent_parent_node, 1)
            update_score(parent_parent_node, upscore // 2)
            if parent_parent_node not in parent_nodes:
                parent_nodes.add(parent_parent_node)

        cnt += 1
        i += 1

    # Finally, pick the highest-scoring candidate parent.
    top_node_score = 0
    for e in parent_nodes:
        score = get_score(e)
        if score > top_node_score:
            top_node = e
            top_node_score = score
        if top_node is None:
            top_node = e
    return top_node
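
# A minimal sketch of the score bookkeeping the clustering pass above leans
# on: update_score / update_node_count / get_score are assumed to accumulate
# a running total on the element itself. The 'gravityScore' / 'gravityNodes'
# attribute names below are assumptions for illustration, not confirmed by
# this file.
import lxml.etree as etree

def update_score(node, add_to_score):
    # Parents accumulate contributions from every scored child paragraph.
    current = float(node.get('gravityScore') or 0)
    node.set('gravityScore', str(current + add_to_score))

def update_node_count(node, add_to_count):
    current = int(node.get('gravityNodes') or 0)
    node.set('gravityNodes', str(current + add_to_count))

def get_score(node):
    return float(node.get('gravityScore') or 0)

# Usage: two child paragraphs contributing 25 and 12 leave the parent at 37.
_parent = etree.Element('div')
update_score(_parent, 25)
update_score(_parent, 12)
assert get_score(_parent) == 37.0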
import copy

def sibling_content(current_sibling, sibling_paragraphs_baseline_score):
    """
    Adds any siblings that may have a decent score to this node.
    """
    if current_sibling.tag == 'p' and len(parser.get_text(current_sibling)) > 0:
        e0 = current_sibling
        if e0.tail:
            # Copy the node before stripping its tail text so the
            # original tree is not mutated.
            e0 = copy.deepcopy(e0)
            e0.tail = ''
        return [e0]
    else:
        potential_paragraphs = parser.get_elements_by_tag(current_sibling, tag='p')
        if potential_paragraphs is None:
            return None

        ps = []
        for first_paragraph in potential_paragraphs:
            text = parser.get_text(first_paragraph)
            if len(text) > 0:
                word_stats = StopWords().get_stop_word_count(text)
                paragraph_score = word_stats.get_stop_word_count()
                sibling_baseline_score = 0.30
                high_link_density = is_high_link_density(first_paragraph)
                score = sibling_paragraphs_baseline_score * sibling_baseline_score
                if score < paragraph_score and not high_link_density:
                    p = parser.create_element(tag='p', text=text, tail=None)
                    ps.append(p)
        return ps
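
# Worked illustration of the sibling filter above: a candidate paragraph is
# kept only when its own stop-word score beats 30% of the baseline computed
# over the top node's paragraphs. The helper name and the numbers are
# illustrative only, not part of the extractor.
def _sibling_qualifies(paragraph_score, baseline_score, threshold=0.30):
    return paragraph_score > baseline_score * threshold

assert _sibling_qualifies(paragraph_score=40, baseline_score=100)      # 40 > 30
assert not _sibling_qualifies(paragraph_score=20, baseline_score=100)  # 20 <= 30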
def remove_paragraphs_with_few_words(top_node):
    """
    Remove paragraphs that have fewer than x words, which would
    indicate that the element is some sort of link.
    """
    all_nodes = parser.get_elements_by_tags(top_node, ['*'])
    all_nodes.reverse()
    for el in all_nodes:
        text = parser.get_text(el)
        stop_words = StopWords().get_stop_word_count(text)
        if stop_words.get_stop_word_count() < 3 \
                and len(parser.get_elements_by_tag(el, tag='object')) == 0 \
                and len(parser.get_elements_by_tag(el, tag='embed')) == 0:
            parser.remove(el)
        # TODO: Check if it is in the right place.
        else:
            trimmed = parser.get_text(el)
            if trimmed.startswith("(") and trimmed.endswith(")"):
                parser.remove(el)
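
# Toy run of the pruning rule above, assuming lxml. To stay self-contained,
# a plain word count stands in for the stop-word count used by the real
# code. The two-word "paragraph" is dropped, while the <embed>-bearing one
# and the real sentence both survive.
import lxml.html

_fragment = lxml.html.fromstring(
    '<div>'
    '<p>Read more</p>'
    '<p><embed src="clip.swf"/></p>'
    '<p>This paragraph has more than enough words to be kept intact.</p>'
    '</div>')
for _el in reversed(_fragment.findall('.//p')):
    _text = _el.text_content().strip()
    _has_media = _el.findall('.//object') or _el.findall('.//embed')
    if len(_text.split()) < 3 and not _has_media:
        _el.getparent().remove(_el)
assert len(_fragment.findall('.//p')) == 2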
def siblings_baseline_score(top_node):
    """
    We could have long articles with tons of paragraphs, so if we tried
    to calculate the base score against the total text score of those
    paragraphs it would be unfair. So we need to normalize the score
    based on the average score of the paragraphs within the top node.
    For example, if the total score of 10 paragraphs was 1000 but each
    had an average value of 100, then 100 should be our base.
    """
    base = 100000
    number_of_paragraphs = 0
    score_of_paragraphs = 0
    nodes_to_check = parser.get_elements_by_tag(top_node, tag='p')
    for node in nodes_to_check:
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        high_link_density = is_high_link_density(node)
        if word_stats.get_stop_word_count() > 2 and not high_link_density:
            number_of_paragraphs += 1
            score_of_paragraphs += word_stats.get_stop_word_count()
    if number_of_paragraphs > 0:
        base = score_of_paragraphs / number_of_paragraphs
    return base
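
# The docstring's normalization, worked through: ten qualifying paragraphs
# whose stop-word scores total 1000 give a per-paragraph baseline of 100,
# which sibling_content then scales by 0.30 when testing sibling paragraphs.
_paragraph_scores = [120, 80, 100, 110, 90, 105, 95, 100, 100, 100]
_base = sum(_paragraph_scores) / len(_paragraph_scores)
assert _base == 100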