예제 #1
0
    def _get_anchor_ratio(self, tag):
        """Return the fraction of a tag's text that lies inside ``<a>`` tags.

        Both lengths are measured after passing the text through
        ``remove_special_chars``.

        Args:
            tag: An object exposing ``.text`` and ``.find_all('a')``
                (presumably a BeautifulSoup tag -- confirm against callers).

        Returns:
            The ratio rounded to 4 decimal places, or ``0`` when the tag
            has no anchor text, has no text at all, or lacks the expected
            attributes.
        """
        # Only the attribute accesses can raise AttributeError, so keep the
        # try body limited to them.
        try:
            text_length = len(remove_special_chars(tag.text))
            anchors_length = sum(
                len(remove_special_chars(anchor.text))
                for anchor in tag.find_all('a'))
        except AttributeError:
            return 0

        # Guard the denominator as well as the numerator so an empty
        # cleaned text can never surface as a ZeroDivisionError.
        if anchors_length == 0 or text_length == 0:
            return 0
        return round(anchors_length / float(text_length), 4)
예제 #2
0
    def _get_punctuation_ratio(self, tag):
        """Return the number of punctuation marks per word in a tag's text.

        The text is first passed through ``remove_special_chars``; words are
        the whitespace-separated pieces of the result, and the counted marks
        are ``. , ! ? : ;``.

        Returns:
            The ratio rounded to 4 decimal places, or ``0`` when there are
            no words or ``tag`` lacks the expected attributes.
        """
        marks = ['.', ',', '!', '?', ':', ';']
        try:
            cleaned = remove_special_chars(tag.text)
            total_words = len(cleaned.split())
            total_marks = sum(map(cleaned.count, marks))
            if total_words != 0:
                return round(total_marks / float(total_words), 4)
            return 0
        except AttributeError:
            return 0
예제 #3
0
 def extract(self):
     """Run the cleaning pipeline over the page HTML and return the result.

     The HTML is read from ``self.context['html']`` and parsed with the
     ``html5lib`` parser; the ``<body>`` element is then threaded through
     each processing stage in order. The final body is serialised with
     ``str`` and passed through ``remove_special_chars``.
     """
     document = BeautifulSoup(self.context.get('html'), 'html5lib')
     node = document.body

     # Single-argument stages, applied strictly in this order.
     # ``_remove_unwanted_tags`` intentionally appears twice.
     stages = (
         self._remove_unwanted_tags,
         self._remove_comments,
         self._convert_elements_to_paragraph,
         self._label_tags_with_scores,
         self._choose_by_density,
         self._remove_by_score,
         self._remove_noisy_tags,
         self._remove_redundant_blocks,
         self._remove_unwanted_tags,
         self._clean_scores,
         self._clean_up_attributes,
     )
     for stage in stages:
         node = stage(node)

     # The title stage needs the page title as an extra argument.
     page_title = self.context.get('title')
     node = self._remove_title_from_text(node, page_title)
     node = self._fix_image_paths(node)

     serialised = str(node)
     return remove_special_chars(serialised)
예제 #4
0
def simple_clean(html):
    """Return the ``<body>`` markup of *html* without non-content elements.

    ``script``, ``style``, ``link`` and ``source`` elements are removed from
    the parsed body, and the serialised result is passed through
    ``remove_special_chars``.

    Args:
        html: HTML markup to parse with the ``lxml`` parser.

    Returns:
        The cleaned body markup, or an empty string when the parsed
        document has no ``<body>`` element.
    """
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        # The parser produced no <body> (e.g. for empty input); without
        # this guard ``body.select`` raises AttributeError on None.
        return ''
    for elem in body.select('script,style,link,source'):
        elem.extract()
    return remove_special_chars(str(body))
예제 #5
0
def extract_body_text_from_html(html):
    """Return the visible text of the ``<body>`` of *html*.

    ``script``, ``style``, ``link`` and ``source`` elements are removed
    before the text is collected with ``get_text()``; the result is passed
    through ``remove_special_chars``.

    Args:
        html: HTML markup to parse with the ``lxml`` parser.

    Returns:
        The cleaned body text, or an empty string when the parsed document
        has no ``<body>`` element.
    """
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        # The parser produced no <body> (e.g. for empty input); without
        # this guard ``body.select`` raises AttributeError on None.
        return ''
    for elem in body.select('script,style,link,source'):
        elem.extract()
    return remove_special_chars(str(body.get_text()))
예제 #6
0
def test_removal_of_special_characters(text, expected):
    """Check that ``remove_special_chars(text)`` equals ``expected``."""
    assert remove_special_chars(text) == expected