def _get_anchor_ratio(self, tag):
    """Return how much of ``tag``'s text sits inside ``<a>`` elements.

    Both lengths are measured after passing the text through
    ``remove_special_chars``. The result is the summed length of the text
    of every element returned by ``tag.find_all('a')`` divided by the
    length of ``tag.text``, rounded to 4 decimal places.

    Returns 0 when there is no anchor text, when the tag has no text to
    divide by, or when ``tag`` lacks the attributes used here (an
    ``AttributeError`` is treated as "no ratio").
    """
    try:
        text_length = len(remove_special_chars(tag.text))
        anchors_length = sum(
            len(remove_special_chars(a.text)) for a in tag.find_all('a'))
    except AttributeError:
        return 0

    # Guard the divisor as well as the numerator so that an empty cleaned
    # text can never surface as a ZeroDivisionError (mirrors the divisor
    # check done by _get_punctuation_ratio).
    if anchors_length == 0 or text_length == 0:
        return 0
    return round(anchors_length / float(text_length), 4)
def _get_punctuation_ratio(self, tag):
    """Return the number of punctuation marks per word in ``tag``'s text.

    The text is first passed through ``remove_special_chars``. Words are
    the whitespace-separated pieces of that text; the marks counted are
    ``. , ! ? : ;``. The ratio is rounded to 4 decimal places.

    Returns 0 when the text contains no words, or when an
    ``AttributeError`` is raised while reading and measuring the text.
    """
    punctuation_marks = '.,!?:;'
    try:
        cleaned = remove_special_chars(tag.text)
        total_words = len(cleaned.split())
        total_marks = sum(cleaned.count(mark) for mark in punctuation_marks)
    except AttributeError:
        return 0

    if total_words == 0:
        return 0
    return round(total_marks / float(total_words), 4)
def extract(self):
    """Run the extraction pipeline over the HTML stored in the context.

    The markup from ``self.context.get('html')`` is parsed with the
    ``html5lib`` parser and its ``<body>`` is handed through a fixed
    sequence of processing steps, each of which receives the current body
    and returns the body for the next step. The final body is serialised
    with ``str`` and passed through ``remove_special_chars``.
    """
    node = BeautifulSoup(self.context.get('html'), 'html5lib').body

    # Single-argument stages, applied in the listed order.
    # _remove_unwanted_tags appears twice on purpose: the original
    # pipeline runs it both before and after the scoring stages.
    stages = (
        self._remove_unwanted_tags,
        self._remove_comments,
        self._convert_elements_to_paragraph,
        self._label_tags_with_scores,
        self._choose_by_density,
        self._remove_by_score,
        self._remove_noisy_tags,
        self._remove_redundant_blocks,
        self._remove_unwanted_tags,
        self._clean_scores,
        self._clean_up_attributes,
    )
    for stage in stages:
        node = stage(node)

    # The title step needs the page title in addition to the body, so it
    # is called outside the loop.
    node = self._remove_title_from_text(node, self.context.get('title'))
    node = self._fix_image_paths(node)

    return remove_special_chars(str(node))
def simple_clean(html):
    """Return the ``<body>`` markup of ``html`` with non-content tags removed.

    The document is parsed with the ``lxml`` parser; every ``script``,
    ``style``, ``link`` and ``source`` element under ``<body>`` is taken
    out of the tree, and the remaining body is serialised with ``str`` and
    passed through ``remove_special_chars``.

    Returns an empty string when the parsed document has no ``<body>``.
    """
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        # The lxml parser does not synthesize a <body> for empty input, so
        # there is nothing to clean; without this check body.select would
        # raise AttributeError.
        return ''

    for elem in body.select('script,style,link,source'):
        elem.extract()
    return remove_special_chars(str(body))
def extract_body_text_from_html(html):
    """Return the text content of the ``<body>`` of ``html``.

    The document is parsed with the ``lxml`` parser; every ``script``,
    ``style``, ``link`` and ``source`` element under ``<body>`` is taken
    out of the tree, and the text of what remains (``body.get_text()``) is
    passed through ``remove_special_chars``.

    Returns an empty string when the parsed document has no ``<body>``.
    """
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        # The lxml parser does not synthesize a <body> for empty input, so
        # there is no text to extract; without this check body.select
        # would raise AttributeError.
        return ''

    for elem in body.select('script,style,link,source'):
        elem.extract()
    return remove_special_chars(str(body.get_text()))
def test_removal_of_special_characters(text, expected):
    """Check that ``remove_special_chars(text)`` equals ``expected``."""
    assert remove_special_chars(text) == expected