def test_fit(self):
    """Fit on a linearly separable 2-D problem and check the training error.

    Points are drawn uniformly from the unit square and labelled 1 where
    ``3 * x0 - 2 * x1 - 1.5 > 0``.  The model's predictions on the same
    points must have a classification error below 0.002.
    """
    from mozsci.evaluation import classification_error

    # fixed seed so the sampled points are the same on every run
    np.random.seed(5)
    N = int(1e5)
    x = np.random.rand(N, 2)
    # builtin int: the ``np.int`` alias was deprecated and later removed
    # from NumPy; it was always just the builtin, so the dtype is unchanged
    y = (3 * x[:, 0] - 2 * x[:, 1] - 1.5 > 0.0).astype(int)

    lr = LogisticRegression()
    lr.fit(x, y, factr=1e4)
    ypred = lr.predict(x)

    self.assertTrue(classification_error(y, ypred) < 0.002)
def test_error_gradient(self):
    """Check ``_loss_gradient`` against a hand-computed loss and gradient.

    The unweighted case verifies both the loss and the gradient.  The
    weighted case adds an extra row to the first data argument, passes
    per-row weights, and verifies the loss only.
    """
    sigmoid = LogisticRegression._sigmoid

    x0 = np.array([self.x[0]])
    x1 = np.array([self.x[1]])
    error, gradient = LogisticRegression._loss_gradient(
        x0, x1, self.b, self.w, self.lam)

    # this assumes test_sigmoid passes
    # NOTE(review): the constants 7, 10, 3 and -1 presumably derive from
    # self.lam and self.w as set up elsewhere -- confirm against the fixture
    err_act = (-np.log(sigmoid(x1, self.b, self.w))
               - np.log(1.0 - sigmoid(x0, self.b, self.w))
               + 0.5 * 7 * 10)

    pred_error = sigmoid(self.x, self.b, self.w) - self.t
    gradient_act = np.array([0.0, 7 * 3, 7 * -1])
    gradient_act[0] = np.sum(pred_error)
    gradient_act[1] += np.sum(pred_error * self.x[:, 0])
    gradient_act[2] += np.sum(pred_error * self.x[:, 1])

    # .item() extracts the single value; calling float() on an array with
    # ndim > 0 is deprecated in recent NumPy releases
    self.assertTrue(abs(np.asarray(err_act).item() - error) < 1.0e-12)
    self.assertTrue(np.all(np.abs(gradient - gradient_act) < 1.0e-12))

    # weighted case
    x00 = np.array([self.x[0], [55, -2]])
    # only the weighted loss is checked below; the gradient is discarded
    error_weighted, _ = LogisticRegression._loss_gradient(
        x00, x1, self.b, self.w, self.lam,
        [np.array([0.4, 0.75]), np.array(0.35)])
    err_weighted_act = (
        -np.log(sigmoid(x1, self.b, self.w)) * 0.35
        - np.log(1.0 - sigmoid(x0, self.b, self.w)) * 0.4
        - np.log(1.0 - sigmoid([x00[1, :]], self.b, self.w)) * 0.75
        + 0.5 * 7 * 10)
    self.assertTrue(
        abs(np.asarray(err_weighted_act).item() - error_weighted) < 1.0e-12)
def test_dragnet_model(self):
    """ContentExtractionModel.analyze must match a prediction built by hand.

    The expected content is assembled from the individual components:
    kohlschuetter features, manual mean/std normalization, and the
    logistic regression block model thresholded at 0.5.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)

    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # make prediction from individual components
    # to do so, we use kohlschuetter.make_features and LogisticRegression
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # one column per (mean, std) pair instead of a hard-coded column count
    for k, (mean, std) in enumerate(zip(mean_std['mean'], mean_std['std'])):
        features_normalized[:, k] = (features[:, k] - mean) / std
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]

    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # check that the tokens are the same!
    # raw string: '\s' in a plain literal is an invalid escape sequence
    whitespace = r'\s+'
    self.assertEqual(re.split(whitespace, actual_content.strip()),
                     re.split(whitespace, content.strip()))
def test_dragnet_model(self):
    """ContentExtractionModel.analyze must match a prediction built by hand.

    The expected content is assembled from the individual components:
    kohlschuetter features, manual mean/std normalization, and the
    logistic regression block model thresholded at 0.5.  The same expected
    tokens are then compared with DragnetModelKohlschuetterFeatures.analyze
    to check backward compatibility.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
                'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]}
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)

    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # make prediction from individual components
    # to do so, we use kohlschuetter.make_features and LogisticRegression
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # iterate the mean/std pairs directly; ``xrange`` does not exist on
    # Python 3 and the column count no longer needs to be hard-coded
    for k, (mean, std) in enumerate(zip(mean_std['mean'], mean_std['std'])):
        features_normalized[:, k] = (features[:, k] - mean) / std
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]

    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # raw string: '\s' in a plain literal is an invalid escape sequence
    whitespace = r'\s+'
    expected_tokens = re.split(whitespace, actual_content.strip())

    # check that the tokens are the same!
    self.assertEqual(expected_tokens,
                     re.split(whitespace, content.strip()))

    # check that we maintain backward compatibility
    from dragnet import DragnetModelKohlschuetterFeatures
    dmkf = DragnetModelKohlschuetterFeatures(block_model, mean_std)
    content_dragnetmodelkohschuetterfeatures = dmkf.analyze(big_html_doc)
    self.assertEqual(
        expected_tokens,
        re.split(whitespace,
                 content_dragnetmodelkohschuetterfeatures.strip()))
def test_map_train_model(self):
    """Train through TrainModelCV, reload the saved model, compare errors.

    The training error reported by the trainer for its first entry must
    equal 0.06 and must equal the error of the model loaded back from
    '/tmp/logistic.json' (presumably written by the trainer -- the path is
    handed to it in the model specification).
    """
    trainer = TrainModelCV(
        [LogisticRegression, classification_error,
         '/tmp/logistic.json', (), {'lam': 0.5}],
        X=self.X, y=self.y)
    errors = trainer.run()

    # load model
    trained_model = LogisticRegression.load_model('/tmp/logistic.json')
    loaded_model_error = classification_error(
        self.y, trained_model.predict(self.X))

    # check the errors
    # dict.keys() cannot be indexed on Python 3; next(iter(...)) yields the
    # same first key on both Python 2 and Python 3
    first_key = next(iter(errors))
    train_error = errors[first_key]['train']
    self.assertTrue(np.abs(train_error - 0.06) < 1e-12)
    self.assertTrue(np.abs(train_error - loaded_model_error) < 1e-12)
def test_error_gradient(self):
    """Verify ``_loss_gradient`` against values computed by hand.

    First the unweighted loss and gradient are checked, then the loss for
    a weighted call in which the first data argument has an extra row.
    """
    sigmoid = LogisticRegression._sigmoid
    tolerance = 1.0e-12

    x0 = np.array([self.x[0]])
    x1 = np.array([self.x[1]])
    error, gradient = LogisticRegression._loss_gradient(
        x0, x1, self.b, self.w, self.lam)

    # this assumes test_sigmoid passes
    # NOTE(review): 7, 10, 3 and -1 presumably follow from self.lam and
    # self.w in the test fixture -- confirm
    err_act = (-np.log(sigmoid(x1, self.b, self.w))
               - np.log(1.0 - sigmoid(x0, self.b, self.w))
               + 0.5 * 7 * 10)

    pred_error = sigmoid(self.x, self.b, self.w) - self.t
    gradient_act = np.array([0.0, 7 * 3, 7 * -1])
    gradient_act[0] = np.sum(pred_error)
    gradient_act[1] += np.sum(pred_error * self.x[:, 0])
    gradient_act[2] += np.sum(pred_error * self.x[:, 1])

    # use .item() rather than float(): converting an array with ndim > 0
    # via float() is deprecated in recent NumPy releases
    self.assertTrue(abs(np.asarray(err_act).item() - error) < tolerance)
    self.assertTrue(np.all(np.abs(gradient - gradient_act) < tolerance))

    # weighted case
    x00 = np.array([self.x[0], [55, -2]])
    # the weighted gradient is not verified, so it is not bound to a name
    error_weighted, _ = LogisticRegression._loss_gradient(
        x00, x1, self.b, self.w, self.lam,
        [np.array([0.4, 0.75]), np.array(0.35)])
    err_weighted_act = (
        -np.log(sigmoid(x1, self.b, self.w)) * 0.35
        - np.log(1.0 - sigmoid(x0, self.b, self.w)) * 0.4
        - np.log(1.0 - sigmoid([x00[1, :]], self.b, self.w)) * 0.75
        + 0.5 * 7 * 10)
    self.assertTrue(
        abs(np.asarray(err_weighted_act).item() - error_weighted) < tolerance)
def test_map_train_model(self):
    """Run TrainModelCV, load the model back from disk, and compare errors.

    Asserts that the trainer's reported training error for its first entry
    is 0.06 and matches the error of the model re-loaded from
    '/tmp/logistic.json' (presumably the file the trainer saves to, since
    that path is part of the model specification passed in).
    """
    model_file = '/tmp/logistic.json'
    trainer = TrainModelCV([
        LogisticRegression, classification_error, model_file, (), {
            'lam': 0.5
        }
    ], X=self.X, y=self.y)
    errors = trainer.run()

    # load model
    trained_model = LogisticRegression.load_model(model_file)
    loaded_model_error = classification_error(
        self.y, trained_model.predict(self.X))

    # check the errors
    # indexing dict.keys() fails on Python 3; next(iter(...)) returns the
    # same first key on Python 2 and Python 3
    train_error = errors[next(iter(errors))]['train']
    self.assertTrue(np.abs(train_error - 0.06) < 1e-12)
    self.assertTrue(np.abs(train_error - loaded_model_error) < 1e-12)
def test_dragnet_model(self):
    """ContentExtractionModel.analyze must match a prediction built by hand.

    The reference content is produced from kohlschuetter features that are
    normalized manually with the given mean/std, scored with the logistic
    regression block model, and thresholded at 0.5.
    """
    params = {"b": 0.2, "w": [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {"mean": [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
                "std": [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]}
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)

    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # make prediction from individual components
    # to do so, we use kohlschuetter.make_features and LogisticRegression
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # iterate the mean/std pairs directly; ``xrange`` does not exist on
    # Python 3 and the column count no longer needs to be hard-coded
    for k, (mean, std) in enumerate(zip(mean_std["mean"], mean_std["std"])):
        features_normalized[:, k] = (features[:, k] - mean) / std
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]

    actual_content = " ".join(
        [blocks[index].text for index in blocks_keep_indices])

    # check that the tokens are the same!
    # raw string: "\s" in a plain literal is an invalid escape sequence
    whitespace = r"\s+"
    self.assertEqual(re.split(whitespace, actual_content.strip()),
                     re.split(whitespace, content.strip()))
def test_sigmoid(self):
    """Compare ``_sigmoid`` on the fixture inputs with closed-form values."""
    computed = LogisticRegression._sigmoid(self.x, self.b, self.w)

    # logistic function evaluated at 6 and at 1.5
    # NOTE(review): presumably the linear scores of the two fixture rows
    # under self.b / self.w -- confirm against the fixture
    expected = np.array([1.0 / (1.0 + np.exp(-score)) for score in (6, 1.5)])

    deviation = np.abs(computed - expected)
    self.assertTrue(np.all(deviation < 1.0e-12))
def train_models(datadir, output_prefix, features_to_use, lam=10):
    """Train a content extraction model.

    Does feature centering, trains the logistic regression model,
    pickles the final model and writes the train/test block level
    errors to a file

    datadir = root directory for all the data
    output_prefix = write the trained model files, errors, etc
        to files starting with this.
    features_to_use = a list of the features to use.  Must be one of the
        features known by AllFeatures
    lam = regularization parameter passed to LogisticRegression

    Files written:
        <output_prefix>_mean_std_<feature>.json for every feature that
            has an ``init_params`` attribute
        <output_prefix>_block_errors.txt
        <output_prefix>_content_model.pickle
    """
    import pprint
    import pickle
    from . import AllFeatures
    from .blocks import TagCountBlockifier as Blkr
    from mozsci.models import LogisticRegression
    from mozsci.numpy_util import NumpyEncoder

    # assemble the features
    feature_instances = [AllFeatures.get(f) for f in features_to_use]

    # compute the mean/std and save them
    data = DragnetModelData(datadir)
    trainer = DragnetModelTrainer()

    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("Initializing features")
    for feature_name, feature in zip(features_to_use, feature_instances):
        # check to see if this feature needs to be init
        # if so, then init it, take the return object and serialize to json
        if hasattr(feature, 'init_params'):
            # initialize it
            model_init = ContentExtractionModel(Blkr, [feature], None)
            features, labels, weights = trainer.make_features_from_data(
                data, model_init, train=True)
            mean_std = feature.init_params(features)
            feature.set_params(mean_std)
            mean_std_path = "%s_mean_std_%s.json" % (output_prefix, feature_name)
            with open(mean_std_path, 'w') as fout:
                fout.write("%s" % json.dumps(mean_std, cls=NumpyEncoder))

    model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

    # train the model
    print("Training the model")
    model = LogisticRegression(lam=lam)
    features, labels, weights = trainer.make_features_from_data(
        data, model_to_train, training_or_test='training')
    model.fit(features, labels, weights=np.minimum(weights, 200))

    print("Checking errors")
    # NOTE(review): the model is fit with weights capped at 200, but the
    # training metrics below use the uncapped weights, while the test
    # metrics use capped weights -- confirm this asymmetry is intended
    train_errors = accuracy_auc(labels, model.predict(features), weights=weights)

    # check errors on test set
    test_features, test_labels, test_weights = trainer.make_features_from_data(
        data, model_to_train, training_or_test='test')
    test_weights = np.minimum(test_weights, 200.0)
    test_errors = accuracy_auc(
        test_labels, model.predict(test_features), weights=test_weights)

    # write errors to a file
    with open(output_prefix + '_block_errors.txt', 'w') as fout:
        fout.write("Training errors for final model (block level):\n")
        pprint.pprint(train_errors, fout)
        fout.write("Test errors (block level):\n")
        pprint.pprint(test_errors, fout)

    # pickle the final model!
    # use the one with threshold = 0.5
    final_model = ContentExtractionModel(
        Blkr, feature_instances, model, threshold=0.5)
    # binary mode is required for pickle on Python 3, and the with-block
    # guarantees the file is flushed and closed
    with open(output_prefix + '_content_model.pickle', 'wb') as fout:
        pickle.dump(final_model, fout)

    print("done!")