示例#1
0
    def test_get_from_idx(self):
        """Check that ``get_from_idx`` returns the tag addressed by an index path.

        For each fixture file the tags produced by ``find_all()`` are
        collected into a list, and selected entries of that list are compared
        with the result of ``get_from_idx`` for the matching index path.
        """
        # fixture key -> (position in the find_all() list, index path) pairs
        cases = (
            ('testfile_1', ((0, [0]),
                            (2, [0, 0, 0]),
                            (7, [0, 1, 3]))),
            ('testfile_2', ((0, [0]),
                            (5, [0, 1, 1]),
                            (11, [0, 1, 2, 0]),
                            (13, [0, 1, 4]))),
        )
        for file_key, expectations in cases:
            soup = Sauce.from_file(self.args[file_key])
            all_tags = list(soup.find_all())
            for position, path in expectations:
                self.assertEqual(all_tags[position], soup.get_from_idx(path))
示例#2
0
    def test_assign_idxs(self):
        """Check the ``idx`` attribute carried by tags of a loaded document.

        For each fixture file the tags produced by ``find_all()`` are
        collected into a list, and the ``idx`` of selected entries is
        compared with the expected index path.
        """
        # fixture key -> (position in the find_all() list, expected idx) pairs
        cases = (
            ('testfile_1', ((0, [0]),
                            (2, [0, 0, 0]),
                            (7, [0, 1, 3]))),
            ('testfile_2', ((0, [0]),
                            (5, [0, 1, 1]),
                            (11, [0, 1, 2, 0]),
                            (13, [0, 1, 4]))),
        )
        for file_key, expectations in cases:
            sauce = Sauce.from_file(self.args[file_key])
            all_tags = list(sauce.find_all())
            for position, expected_idx in expectations:
                self.assertEqual(all_tags[position].idx, expected_idx)
示例#3
0
    def test_to_dataframe(self):
        """Featurize the first fixture and check the ``is_bold`` column.

        Registers a categorical, a numerical and a text feature, builds the
        dataframe, and verifies that every row whose text contains 'bold' is
        flagged as bold while no row whose text contains 'normal' is.
        """
        ftrs = Featurizer()
        soup = Sauce.from_file(self.args['testfile_1'])

        @ftrs.add_categorical_feature("tag_name")
        def f_tag_name(tag):
            return tag.name

        @ftrs.add_numerical_feature('is_bold')
        def f_is_bold(tag):
            return bool(tag.css_attrs['font-weight'] == 'bold')

        @ftrs.add_text_feature('text')
        def f_text(tag):
            # Head/meta/script tags contribute no text at all.
            if tag.name in ('head', 'meta', 'script'):
                return ""
            # Non-recursive lookup: only text matched directly on this tag.
            own_text = list(tag.find_all(text=True, recursive=False))
            if not own_text:
                return ""
            # Strip first, then flatten the remaining newlines to spaces.
            return " ".join(own_text).strip().replace("\n", " ")

        ftrs.featurize(soup)
        df = ftrs.to_dataframe(soup)

        bold_rows = df[df.text.str.contains('bold')]
        self.assertEqual(bold_rows.is_bold.sum(), len(bold_rows))

        normal_rows = df[df.text.str.contains('normal')]
        self.assertEqual(normal_rows.is_bold.sum(), 0)
示例#4
0
    def test_featurize(self):
        """Run ``featurize`` on the first fixture with three registered features.

        The test makes no assertions; it only passes if registering the
        categorical, numerical and text features and featurizing the
        document raise no exception.
        """
        ftrs = Featurizer()
        soup = Sauce.from_file(self.args['testfile_1'])

        @ftrs.add_categorical_feature("tag_name")
        def f_tag_name(tag):
            return tag.name

        @ftrs.add_numerical_feature('is_bold')
        def f_is_bold(tag):
            return bool(tag.css_attrs['font-weight'] == 'bold')

        @ftrs.add_text_feature('text')
        def f_text(tag):
            # Head/meta/script tags contribute no text at all.
            if tag.name in ('head', 'meta', 'script'):
                return ""
            # Non-recursive lookup: only text matched directly on this tag.
            own_text = list(tag.find_all(text=True, recursive=False))
            if not own_text:
                return ""
            # Strip first, then flatten the remaining newlines to spaces.
            return " ".join(own_text).strip().replace("\n", " ")

        ftrs.featurize(soup)
示例#5
0
    def test_from_file(self):
        """Check that ``Sauce.from_file`` equals parsing the same file with ``bs``."""
        path = self.args['testfile_1']

        # Reference parse: hand the open file to ``bs`` with the lxml parser.
        with open(path, "r") as handle:
            expected = bs(handle, 'lxml')

        loaded = Sauce.from_file(path)
        self.assertEqual(expected, loaded)
示例#6
0
    def test_from_url(self):
        """Check that ``Sauce.from_url`` equals parsing the fetched page with ``bs``.

        NOTE: this test performs a live HTTP request to Wikipedia.
        """
        url = "https://en.wikipedia.org/wiki/Grace_Hopper"
        # Bound the wait so that an unresponsive server makes this request
        # fail instead of blocking the test run indefinitely.
        r = requests.get(url, timeout=30)
        content = r.text
        soup = bs(content, 'lxml')

        sauce = Sauce.from_url(url)
        self.assertEqual(soup, sauce)
示例#7
0
    def test_init(self):
        """Check that ``Sauce(file)`` equals ``bs(file, 'lxml')`` for the same file."""
        path = self.args['testfile_1']

        # Each constructor gets its own freshly opened handle.
        with open(path, "r") as reference_handle:
            expected = bs(reference_handle, 'lxml')

        with open(path, "r") as sauce_handle:
            built = Sauce(sauce_handle)

        self.assertEqual(expected, built)
示例#8
0
    def test_dummify(self):
        """Check the per-tag-name columns produced with ``normalize=True``.

        A single categorical ``tag_name`` feature is registered; the
        resulting dataframe is expected to expose ``tag_name_<value>``
        columns whose sums match the counts asserted below.
        """
        ftrs = Featurizer()
        soup = Sauce.from_file(self.args['testfile_1'])

        @ftrs.add_categorical_feature("tag_name")
        def f_tag_name(tag):
            return tag.name

        ftrs.featurize(soup)
        df = ftrs.to_dataframe(soup, normalize=True)

        # column name -> expected column sum
        expected_sums = (
            ('tag_name_div', 6),
            ('tag_name_html', 1),
            ('tag_name_p', 2),
        )
        for column, expected in expected_sums:
            self.assertEqual(df[column].sum(), expected)
示例#9
0
    def test_normalize(self):
        """Check that a numerical feature spans 0..1 with ``normalize=True``.

        A ``char_cnt`` feature is registered and the resulting dataframe
        column is expected to have a maximum of 1.0 and a minimum of 0.0.
        """
        ftrs = Featurizer()
        soup = Sauce.from_file(self.args['testfile_1'])

        @ftrs.add_numerical_feature('char_cnt')
        def f_text(tag):
            # Head/meta/script tags count as zero characters.
            if tag.name in ('head', 'meta', 'script'):
                return 0
            # Non-recursive lookup: only text matched directly on this tag.
            own_text = list(tag.find_all(text=True, recursive=False))
            if not own_text:
                return 0
            # Strip first, then flatten the remaining newlines to spaces.
            flattened = " ".join(own_text).strip().replace("\n", " ")
            return len(flattened)

        ftrs.featurize(soup)
        df = ftrs.to_dataframe(soup, normalize=True)
        char_counts = df['char_cnt']
        self.assertEqual(char_counts.max(), 1.)
        self.assertEqual(char_counts.min(), 0.)