def test_get_from_idx(self):
    """Check that ``get_from_idx`` returns the tags that ``find_all`` yields.

    For each fixture file, a position in ``list(find_all())`` is paired with
    the index path that is expected to address the very same tag.
    """
    cases = (
        ('testfile_1', (
            (0, [0]),
            (2, [0, 0, 0]),
            (7, [0, 1, 3]),
        )),
        ('testfile_2', (
            (0, [0]),
            (5, [0, 1, 1]),
            (11, [0, 1, 2, 0]),
            (13, [0, 1, 4]),
        )),
    )
    for fixture_key, pairs in cases:
        document = Sauce.from_file(self.args[fixture_key])
        flat_tags = list(document.find_all())
        for position, idx_path in pairs:
            self.assertEqual(flat_tags[position],
                             document.get_from_idx(idx_path))
def test_assign_idxs(self):
    """Check the ``idx`` attribute carried by tags of a loaded document.

    For each fixture file, selected positions in ``list(find_all())`` are
    compared against the index path each of those tags should hold.
    """
    cases = (
        ('testfile_1', (
            (0, [0]),
            (2, [0, 0, 0]),
            (7, [0, 1, 3]),
        )),
        ('testfile_2', (
            (0, [0]),
            (5, [0, 1, 1]),
            (11, [0, 1, 2, 0]),
            (13, [0, 1, 4]),
        )),
    )
    for fixture_key, pairs in cases:
        document = Sauce.from_file(self.args[fixture_key])
        flat_tags = list(document.find_all())
        for position, expected_idx in pairs:
            self.assertEqual(flat_tags[position].idx, expected_idx)
def test_to_dataframe(self):
    """Featurize the first fixture and check the resulting dataframe.

    Rows whose text contains 'bold' must all have ``is_bold`` set, and rows
    whose text contains 'normal' must have none set.
    """
    ftrs = Featurizer()
    soup = Sauce.from_file(self.args['testfile_1'])

    @ftrs.add_categorical_feature("tag_name")
    def f_tag_name(tag):
        return tag.name

    @ftrs.add_numerical_feature('is_bold')
    def f_is_bold(tag):
        # bool() keeps the result a strict True/False, as the if/else did.
        return bool(tag.css_attrs['font-weight'] == 'bold')

    @ftrs.add_text_feature('text')
    def f_text(tag):
        # Tags in this list contribute no text.
        if tag.name in ['head', 'meta', 'script']:
            return ""
        # Only the tag's direct text nodes are collected (recursive=False).
        fragments = list(tag.find_all(text=True, recursive=False))
        if not fragments:
            return ""
        joined = " ".join(fragments).strip()
        return re.sub("\n", " ", joined)

    ftrs.featurize(soup)
    df = ftrs.to_dataframe(soup)

    bold_rows = df[df.text.str.contains('bold')]
    self.assertEqual(bold_rows.is_bold.sum(), len(bold_rows))

    normal_rows = df[df.text.str.contains('normal')]
    self.assertEqual(normal_rows.is_bold.sum(), 0)
def test_featurize(self):
    """Register one feature of each kind and run ``featurize`` on a fixture.

    There are no assertions; the test passes when featurizing the first
    fixture completes without raising.
    """
    ftrs = Featurizer()
    soup = Sauce.from_file(self.args['testfile_1'])

    @ftrs.add_categorical_feature("tag_name")
    def f_tag_name(tag):
        return tag.name

    @ftrs.add_numerical_feature('is_bold')
    def f_is_bold(tag):
        # bool() keeps the result a strict True/False, as the if/else did.
        return bool(tag.css_attrs['font-weight'] == 'bold')

    @ftrs.add_text_feature('text')
    def f_text(tag):
        # Tags in this list contribute no text.
        if tag.name in ['head', 'meta', 'script']:
            return ""
        # Only the tag's direct text nodes are collected (recursive=False).
        fragments = list(tag.find_all(text=True, recursive=False))
        if not fragments:
            return ""
        joined = " ".join(fragments).strip()
        return re.sub("\n", " ", joined)

    ftrs.featurize(soup)
def test_from_file(self):
    """``Sauce.from_file`` must equal parsing the same file with ``bs``/lxml."""
    fixture_path = self.args['testfile_1']

    # Reference parse straight from the open file handle.
    with open(fixture_path, "r") as handle:
        expected = bs(handle, 'lxml')

    actual = Sauce.from_file(fixture_path)
    self.assertEqual(expected, actual)
def test_from_url(self):
    """``Sauce.from_url`` must equal parsing the fetched page with ``bs``/lxml.

    This test needs network access: the page is downloaded once here with
    ``requests`` and once more by ``Sauce.from_url``.
    NOTE(review): the comparison presumes both downloads return identical
    markup -- confirm the page has no per-request content.
    """
    url = "https://en.wikipedia.org/wiki/Grace_Hopper"

    # requests applies no timeout by default, so without one a stalled
    # server would hang the whole test run instead of failing it.
    response = requests.get(url, timeout=30)
    soup = bs(response.text, 'lxml')

    sauce = Sauce.from_url(url)
    self.assertEqual(soup, sauce)
def test_init(self):
    """Building ``Sauce`` from a file object must equal the ``bs``/lxml parse."""
    fixture_path = self.args['testfile_1']

    # Reference parse of the fixture.
    with open(fixture_path, "r") as handle:
        expected = bs(handle, 'lxml')

    # The file is reopened so the constructor reads it from the start.
    with open(fixture_path, "r") as handle:
        actual = Sauce(handle)

    self.assertEqual(expected, actual)
def test_dummify(self):
    """Check the per-tag columns produced for a categorical feature.

    With ``normalize=True`` the dataframe is expected to contain
    ``tag_name_<tag>`` columns whose sums match the counts listed below.
    """
    ftrs = Featurizer()
    soup = Sauce.from_file(self.args['testfile_1'])

    @ftrs.add_categorical_feature("tag_name")
    def f_tag_name(tag):
        return tag.name

    ftrs.featurize(soup)
    frame = ftrs.to_dataframe(soup, normalize=True)

    expected_sums = (
        ('tag_name_div', 6),
        ('tag_name_html', 1),
        ('tag_name_p', 2),
    )
    for column, total in expected_sums:
        self.assertEqual(frame[column].sum(), total)
def test_normalize(self):
    """Check that a normalized numerical feature has minimum 0 and maximum 1.

    The feature is the character count of each tag's direct text.
    """
    ftrs = Featurizer()
    soup = Sauce.from_file(self.args['testfile_1'])

    @ftrs.add_numerical_feature('char_cnt')
    def f_text(tag):
        # Tags in this list count as zero characters.
        if tag.name in ['head', 'meta', 'script']:
            return 0
        # Only the tag's direct text nodes are collected (recursive=False).
        fragments = list(tag.find_all(text=True, recursive=False))
        if not fragments:
            return 0
        joined = " ".join(fragments).strip()
        return len(re.sub("\n", " ", joined))

    ftrs.featurize(soup)
    frame = ftrs.to_dataframe(soup, normalize=True)

    column = frame['char_cnt']
    self.assertEqual(column.max(), 1.)
    self.assertEqual(column.min(), 0.)