def headings(doc):
    """Collect the non-blank heading texts found in ``doc``.

    Looks up the ``h1`` through ``h5`` elements of ``doc`` via
    ``parser.get_elements_by_tags`` and returns a list holding the
    whitespace-stripped ``text`` of each one, in the order the parser
    returned them. Elements whose ``text`` is missing, empty, or only
    whitespace are skipped.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5']
    raw_texts = (
        element.text
        for element in parser.get_elements_by_tags(doc, heading_tags)
    )
    # ``raw and raw.strip()`` drops both None/empty and whitespace-only text.
    return [raw.strip() for raw in raw_texts if raw and raw.strip()]
def convert_div_to_p(doc, dom_type):
    """Rewrite every ``dom_type`` element of ``doc`` and return ``doc``.

    Each element returned by ``parser.get_elements_by_tag(doc, tag=dom_type)``
    is handled in one of two ways:

    * If ``parser.get_elements_by_tags`` finds none of the tags ``a``,
      ``blockquote``, ``dl``, ``div``, ``img``, ``ol``, ``p``, ``pre``,
      ``table`` or ``ul`` for it, the element is passed to
      ``replace_elements_with_p``.
    * Otherwise the element is cleared and refilled, in order, with the
      nodes produced by ``get_replacement_nodes``.

    ``None`` entries in the element list are skipped.

    Args:
        doc: The document the elements are looked up in; also forwarded to
            the replacement helpers.
        dom_type: Tag name of the elements to process (used as ``tag=``).

    Returns:
        The same ``doc`` object that was passed in.
    """
    tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
            'table', 'ul']
    for div in parser.get_elements_by_tag(doc, tag=dom_type):
        # Guard before the element is used; previously the None check only
        # ran after ``div`` had already been handed to the parser.
        if div is None:
            continue

        items = parser.get_elements_by_tags(div, tags)
        if len(items) == 0:
            replace_elements_with_p(doc, div)
            continue

        # Compute the replacements before clearing, since they are derived
        # from the element's current content.
        replace_nodes = get_replacement_nodes(doc, div)
        div.clear()
        for position, node in enumerate(replace_nodes):
            div.insert(position, node)
    return doc
def remove_paragraphs_with_few_words(top_node):
    """Prune elements under ``top_node`` that look like links, not prose.

    Every element returned by ``parser.get_elements_by_tags(top_node, ['*'])``
    is visited in reverse of the order the parser returned it. An element is
    removed with ``parser.remove`` when either:

    * its text has fewer than three stop words (as counted by
      ``StopWords``) and ``parser.get_elements_by_tag`` finds no ``object``
      and no ``embed`` element for it; or
    * the first rule did not apply and its text starts with ``(`` and ends
      with ``)``.
    """
    candidates = parser.get_elements_by_tags(top_node, ['*'])
    candidates.reverse()
    for node in candidates:
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)

        # ``all`` over a generator keeps the original short-circuiting: the
        # object/embed lookups only run when the stop-word count is low.
        lacks_prose = word_stats.get_stop_word_count() < 3 and all(
            len(parser.get_elements_by_tag(node, tag=media_tag)) == 0
            for media_tag in ('object', 'embed')
        )
        if lacks_prose:
            parser.remove(node)
            continue

        # TODO: Check if it is in the right place.
        wrapped = parser.get_text(node)
        if wrapped.startswith("(") and wrapped.endswith(")"):
            parser.remove(node)