def test_small_doc(self):
    """Trivial documents yield no features and empty extracted content."""
    # An empty document produces no feature matrix and no blocks.
    self.assertEqual((None, []), kohlschuetter.make_features('<html></html>'))
    self.assertEqual('', kohlschuetter.analyze('<html></html>'))

    # A two-block document is below the minimum size for features,
    # but the block text is still extracted in order.
    doc = '<html> <p>a</p> <div>b</div> </html>'
    feats, doc_blocks = kohlschuetter.make_features(doc)
    self.assertTrue(feats is None)
    self.block_output_tokens(doc_blocks, [['a'], ['b']])
    self.assertEqual('a b', kohlschuetter.analyze(doc))
def test_dragnet_model(self):
    """End-to-end model output matches a manual feature-normalize + predict pipeline.

    NOTE(review): this method name is re-defined later in the class, so only
    the last definition is actually collected by the test runner — consider
    renaming or removing the duplicates.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from individual components: normalize the raw
    # kohlschuetter features by hand, then apply the logistic regression.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std['mean'][k]) / mean_std['std'][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!
    # Use raw strings for the regex: '\s' is an invalid escape sequence
    # in an ordinary string literal on modern Python.
    self.assertEqual(re.split(r'\s+', actual_content.strip()),
                     re.split(r'\s+', content.strip()))
def test_dragnet_model(self):
    """Model output matches a manual pipeline and the legacy wrapper class.

    NOTE(review): this method name appears three times in the class; only the
    final definition is run — the duplicates should be renamed or merged.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from individual components: normalize the raw
    # kohlschuetter features by hand, then apply the logistic regression.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # range, not xrange: xrange does not exist on Python 3 (NameError).
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std['mean'][k]) / mean_std['std'][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!  Raw strings for the regex:
    # '\s' is an invalid escape sequence in an ordinary literal.
    self.assertEqual(re.split(r'\s+', actual_content.strip()),
                     re.split(r'\s+', content.strip()))

    # Check that we maintain backward compatibility with the old wrapper.
    from dragnet import DragnetModelKohlschuetterFeatures
    dmkf = DragnetModelKohlschuetterFeatures(block_model, mean_std)
    content_dragnetmodelkohschuetterfeatures = dmkf.analyze(big_html_doc)
    self.assertEqual(
        re.split(r'\s+', actual_content.strip()),
        re.split(r'\s+', content_dragnetmodelkohschuetterfeatures.strip()))
def test_make_features(self):
    """Feature rows hold (link, text) densities of previous/current/next block."""
    s = '<html> <p>first </p> <div> <p>second block with <a href=' '>anchor</a> </p> <p>the third block</p> </div> </html>'
    features, blocks = kohlschuetter.make_features(s)

    # Three blocks, with the anchor text attributed to the middle block.
    self.block_output_tokens(
        blocks,
        [['first'],
         ['second', 'block', 'with', 'anchor'],
         ['the', 'third', 'block']])
    self.link_output_tokens(blocks, [[], ['anchor'], []])

    # Expected per-block densities; a feature row is the concatenation of
    # (link_density, text_density) for previous, current and next block,
    # zero-padded at the document edges.
    text_density = [1.0, 4.0, 3.0]
    link_density = [1.0, 0.25, 1.0 / 3.0]
    expected_rows = [
        [0.0, 0.0,
         link_density[0], text_density[0],
         link_density[1], text_density[1]],
        [link_density[0], text_density[0],
         link_density[1], text_density[1],
         link_density[2], text_density[2]],
        [link_density[1], text_density[1],
         link_density[2], text_density[2],
         0.0, 0.0],
    ]
    for row, expected in enumerate(expected_rows):
        self.assertTrue(np.allclose(features[row, :], expected))
def test_make_features(self):
    """Each feature row is prev/current/next (link, text) density, edge-padded.

    NOTE(review): this duplicates the method name defined earlier in the
    class, so the earlier definition is shadowed.
    """
    s = '<html> <p>first </p> <div> <p>second block with <a href=''>anchor</a> </p> <p>the third block</p> </div> </html>'
    features, blocks = kohlschuetter.make_features(s)

    self.block_output_tokens(
        blocks,
        [['first'],
         ['second', 'block', 'with', 'anchor'],
         ['the', 'third', 'block']])
    self.link_output_tokens(blocks, [[], ['anchor'], []])

    # Per-block densities used to assemble the expected feature rows below.
    td = [1.0, 4.0, 3.0]
    ld = [1.0, 0.25, 1.0 / 3.0]

    # First block: no predecessor, so the leading pair is zero.
    self.assertTrue(np.allclose(
        features[0, :], [0.0, 0.0, ld[0], td[0], ld[1], td[1]]))
    # Middle block: both neighbours present.
    self.assertTrue(np.allclose(
        features[1, :], [ld[0], td[0], ld[1], td[1], ld[2], td[2]]))
    # Last block: no successor, so the trailing pair is zero.
    self.assertTrue(np.allclose(
        features[2, :], [ld[1], td[1], ld[2], td[2], 0.0, 0.0]))
def test_dragnet_model(self):
    """End-to-end model output matches a manual feature-normalize + predict pipeline.

    NOTE(review): this is the third definition of this method name in the
    class; it shadows the earlier two — consider renaming or merging them.
    """
    params = {"b": 0.2, "w": [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {"mean": [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
                "std": [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]}
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from individual components: normalize the raw
    # kohlschuetter features by hand, then apply the logistic regression.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # range, not xrange: xrange does not exist on Python 3 (NameError).
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std["mean"][k]) / mean_std["std"][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = " ".join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!  Raw strings for the regex:
    # "\s" is an invalid escape sequence in an ordinary literal.
    self.assertEqual(re.split(r"\s+", actual_content.strip()),
                     re.split(r"\s+", content.strip()))