def test_dragnet_model(self):
    """End-to-end check of ContentExtractionModel.

    Builds a model from a hand-specified logistic regression and normalized
    Kohlschuetter features, then verifies its extracted content token stream
    matches a prediction re-derived manually from the individual components
    (kohlschuetter.make_features + LogisticRegression.predict).
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3],
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(
        Blockifier, [koh_features], block_model, threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Re-derive the prediction from the individual components:
    # featurize, normalize per-column, then threshold the model output.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std['mean'][k]) / mean_std['std'][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = ' '.join(
        blocks[index].text for index in blocks_keep_indices)

    # Check that the tokens are the same!  Raw strings so '\s' is a regex
    # whitespace class, not an invalid Python string escape.
    self.assertEqual(re.split(r'\s+', actual_content.strip()),
                     re.split(r'\s+', content.strip()))
def test_dragnet_model(self):
    """End-to-end check of ContentExtractionModel.

    Verifies the model's extracted content against a manual prediction
    assembled from the individual components, and that the legacy
    DragnetModelKohlschuetterFeatures wrapper produces the same output.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3],
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(
        Blockifier, [koh_features], block_model, threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from the individual components: featurize with
    # kohlschuetter.make_features, normalize, then apply the block model.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # range, not xrange: xrange is Python-2-only and raises NameError on 3.
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std['mean'][k]) / mean_std['std'][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!  Raw strings keep '\s' a regex
    # whitespace class rather than an invalid string escape.
    self.assertEqual(re.split(r'\s+', actual_content.strip()),
                     re.split(r'\s+', content.strip()))

    # Check that we maintain backward compatibility with the old wrapper
    # class (import kept local so the legacy path is only touched here).
    from dragnet import DragnetModelKohlschuetterFeatures
    dmkf = DragnetModelKohlschuetterFeatures(block_model, mean_std)
    content_dragnetmodelkohschuetterfeatures = dmkf.analyze(big_html_doc)
    self.assertEqual(
        re.split(r'\s+', actual_content.strip()),
        re.split(r'\s+', content_dragnetmodelkohschuetterfeatures.strip()))
def test_map_train_model(self):
    """Train via TrainModelCV, reload the serialized model, and confirm the
    reported training error matches both a known value and the error
    recomputed from the reloaded model."""
    trainer = TrainModelCV(
        [LogisticRegression, classification_error, '/tmp/logistic.json',
         (), {'lam': 0.5}],
        X=self.X, y=self.y)
    errors = trainer.run()

    # Load the model that trainer.run() serialized to disk.
    trained_model = LogisticRegression.load_model('/tmp/logistic.json')
    loaded_model_error = classification_error(
        self.y, trained_model.predict(self.X))

    # Check the errors.  next(iter(errors)) replaces errors.keys()[0]:
    # dict views are not subscriptable on Python 3.
    first_key = next(iter(errors))
    self.assertTrue(np.abs(errors[first_key]['train'] - 0.06) < 1e-12)
    self.assertTrue(
        np.abs(errors[first_key]['train'] - loaded_model_error) < 1e-12)
def test_map_train_model(self):
    """Train via TrainModelCV, reload the serialized model, and confirm the
    reported training error matches both a known value and the error
    recomputed from the reloaded model."""
    trainer = TrainModelCV([
        LogisticRegression, classification_error, '/tmp/logistic.json', (), {
            'lam': 0.5
        }
    ], X=self.X, y=self.y)
    errors = trainer.run()

    # Load the model that trainer.run() serialized to disk.
    trained_model = LogisticRegression.load_model('/tmp/logistic.json')
    loaded_model_error = classification_error(
        self.y, trained_model.predict(self.X))

    # Check the errors.  next(iter(errors)) replaces errors.keys()[0],
    # which raises TypeError on Python 3 (dict views aren't subscriptable).
    first_key = next(iter(errors))
    self.assertTrue(
        np.abs(errors[first_key]['train'] - 0.06) < 1e-12)
    self.assertTrue(
        np.abs(errors[first_key]['train'] - loaded_model_error) < 1e-12)
def test_dragnet_model(self):
    """End-to-end check of ContentExtractionModel.

    Builds the model from hand-specified logistic-regression weights and
    normalized Kohlschuetter features, then verifies its extracted content
    tokens match a prediction re-derived from the individual components.
    """
    params = {"b": 0.2, "w": [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        "mean": [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        "std": [1.0, 2.0, 0.5, 1.2, 0.75, 1.3],
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(
        Blockifier, [koh_features], block_model, threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from the individual components: featurize,
    # normalize each column, then threshold the block-model output.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # range, not xrange: xrange does not exist on Python 3.
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std["mean"][k]) / mean_std["std"][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = " ".join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!  Raw strings so "\s" stays a
    # regex whitespace class instead of an invalid string escape.
    self.assertEqual(re.split(r"\s+", actual_content.strip()),
                     re.split(r"\s+", content.strip()))