Пример #1
0
    def test_num_of_links(self, soup, expected):
        """The link count reported for ``soup`` must equal ``expected``."""
        feature = DocumentObjectFeature(
            soup=soup,
            lp_url="https://felp_dummy_internal.com",
        )

        assert feature.get_num_of_links() == expected
Пример #2
0
    def test_get_text_size_links_ratio(self, soup, expected):
        """``get_text_size_links_ratio()`` rounded to 2 places must equal ``expected``."""
        feature = DocumentObjectFeature(
            soup=soup,
            lp_url="https://felp_dummy_internal.com",
        )
        rounded_ratio = round(feature.get_text_size_links_ratio(), 2)

        assert rounded_ratio == expected
Пример #3
0
    def test_number_of_external_total_links_ratio(self, soup, expected):
        """The external/total links ratio rounded to 2 places must equal ``expected``."""
        feature = DocumentObjectFeature(
            soup=soup,
            lp_url="https://felp_dummy_internal.com",
        )
        rounded_ratio = round(
            feature.get_number_of_external_total_links_ratio(), 2)

        assert rounded_ratio == expected
Пример #4
0
    def test_get_links(self, soup, expected_internal, expected_external):
        """Check how many internal and external links ``get_links()`` reports."""
        feature = DocumentObjectFeature(
            soup=soup,
            lp_url="https://felp_dummy_internal.com",
        )
        found = feature.get_links()

        internal_count = len(found.internal)
        assert internal_count == expected_internal
        external_count = len(found.external)
        assert external_count == expected_external
Пример #5
0
    def test_get_main_text(self, soup, expected_main_text_num):
        """Compare ``get_main_text()`` with the stored ``main_text.txt`` fixture."""
        feature = DocumentObjectFeature(
            soup=soup,
            lp_url="https://felp_dummy_internal.com",
        )
        actual_text = feature.get_main_text()

        # The reference text lives under FIXTURES_ROOT/<expected_main_text_num>/.
        fixture_dir = self.FIXTURES_ROOT / expected_main_text_num
        with (fixture_dir / "main_text.txt").open("r") as fixture_file:
            reference_text = fixture_file.read()

        assert actual_text == reference_text
Пример #6
0
 def test_get_external_links(self, soup, expected):
     """Placeholder: print the extracted external links, then fail.

     ``expected`` is accepted but not compared against anything yet.
     """
     feature = DocumentObjectFeature(
         soup=soup,
         lp_url="https://felp_dummy_internal.com",
     )
     print(feature.get_external_links())
     raise NotImplementedError
Пример #7
0
 def __init__(self, lp_url: str, soup: BeautifulSoup) -> None:
     """Keep ``soup`` and build a ``DocumentObjectFeature`` for the same page."""
     self._soup = soup
     self._doc_feat_extractor = DocumentObjectFeature(
         lp_url=lp_url,
         soup=soup,
     )
Пример #8
0
class ReadabilityFeature(object):
    """Text-size and readability features of a single parsed page.

    The size-based getters work on the ``soup`` passed to the constructor and
    on a ``DocumentObjectFeature`` built from the same arguments. The
    readability, token-count and summarizability getters are placeholders
    that raise ``NotImplementedError``.
    """

    def __init__(self, lp_url: str, soup: BeautifulSoup) -> None:
        """Store ``soup`` and build the document-level feature extractor.

        Args:
            lp_url: Page URL, forwarded unchanged to ``DocumentObjectFeature``.
            soup: Parsed document whose text is measured.
        """
        self._soup = soup
        self._doc_feat_extractor = DocumentObjectFeature(lp_url, soup)

    def get_main_total_text_size_ratio(self) -> float:
        """Return the total link-text length divided by the main-text length.

        The numerator is the summed ``len(link.text)`` over all internal and
        external links; the denominator is the length of the main text.

        NOTE(review): the method name suggests a main-text/total-text ratio,
        while the code computes link text over main text -- confirm which one
        is intended before relying on this feature.

        Returns:
            The ratio, or ``0.0`` when the main text is empty (instead of
            raising ``ZeroDivisionError``).
        """
        extractor = self._doc_feat_extractor
        main_text_size = len(extractor.get_main_text())
        if main_text_size == 0:
            # A page without main text has no meaningful ratio.
            return 0.0

        internal_link_text_size = sum(
            len(link.text) for link in extractor.get_internal_links())
        external_link_text_size = sum(
            len(link.text) for link in extractor.get_external_links())

        return (internal_link_text_size +
                external_link_text_size) / main_text_size

    def get_total_text_size(self) -> float:
        """Return the length of ``soup.get_text(strip=True)``."""
        return len(self._soup.get_text(strip=True))

    def get_main_text_size(self) -> float:
        """Return the length of the main text reported by the extractor."""
        return len(self._doc_feat_extractor.get_main_text())

    def get_flash_kincaid_title_readability(self) -> float:
        """Readability of the title (presumably Flesch-Kincaid).

        Raises:
            NotImplementedError: Always; the feature is not implemented yet.
        """
        # TODO: implement; the title element is presumably reachable via
        # self._soup.find("title").
        raise NotImplementedError

    def get_flash_kincaid_abstract_readability(self) -> float:
        """Readability of the abstract (presumably Flesch-Kincaid).

        Raises:
            NotImplementedError: Always; the feature is not implemented yet.
        """
        raise NotImplementedError

    def get_token_count(self) -> float:
        """Number of tokens.

        Raises:
            NotImplementedError: Always; the feature is not implemented yet.
        """
        raise NotImplementedError

    def get_summarizability_score(self) -> float:
        """Summarizability of the text.

        Raises:
            NotImplementedError: Always; the feature is not implemented yet.
        """
        raise NotImplementedError

    def get_flash_kincaid_main_text_readability(self) -> float:
        """Readability of the main text (presumably Flesch-Kincaid).

        Raises:
            NotImplementedError: Always; the feature is not implemented yet.
        """
        raise NotImplementedError