def test_fit(self):
    from mozsci.evaluation import classification_error
    np.random.seed(5)
    N = int(1e5)
    x = np.random.rand(N, 2)
    # labels come from a known linear boundary: 3*x0 - 2*x1 - 1.5 > 0
    y = (3 * x[:, 0] - 2 * x[:, 1] - 1.5 > 0.0).astype(int)
    lr = LogisticRegression()
    lr.fit(x, y, factr=1e4)
    ypred = lr.predict(x)
    self.assertTrue(classification_error(y, ypred) < 0.002)
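The labels above come from a known linear boundary, 3*x0 - 2*x1 - 1.5 > 0, so a logistic regression with weak regularization should drive the training error essentially to zero. A minimal sketch of the same check using scikit-learn as a stand-in (an assumption: the test itself exercises mozsci's own LogisticRegression, whose factr argument looks like an L-BFGS-B convergence tolerance passed through to the optimizer):

# same synthetic problem, checked with scikit-learn as a stand-in
import numpy as np
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

np.random.seed(5)
N = int(1e5)
x = np.random.rand(N, 2)
y = (3 * x[:, 0] - 2 * x[:, 1] - 1.5 > 0.0).astype(int)

clf = SkLogisticRegression(C=1e4)   # large C ~= weak regularization
clf.fit(x, y)
error = np.mean(clf.predict(x) != y)
print(error)  # should be well under 1% on this linearly separable problem
# clf.coef_ should be close to a positive multiple of (3, -2), with
# clf.intercept_ near the matching multiple of -1.5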
Example #2
    def test_error_gradient(self):
        x0 = np.array([self.x[0]])
        x1 = np.array([self.x[1]])
        error, gradient = LogisticRegression._loss_gradient(
            x0, x1, self.b, self.w, self.lam)

        # this assumes test_sigmoid passes;
        # 0.5 * 7 * 10 is the L2 penalty 0.5 * lam * ||w||^2 for lam = 7, w = [3, -1]
        err_act = -np.log(LogisticRegression._sigmoid(
            x1, self.b, self.w)) - np.log(1.0 - LogisticRegression._sigmoid(
                x0, self.b, self.w)) + 0.5 * 7 * 10

        pred_error = LogisticRegression._sigmoid(self.x, self.b,
                                                 self.w) - self.t
        # regularization part of the gradient: (0, lam * w) = (0, 21, -7) for lam = 7
        gradient_act = np.array([0.0, 7 * 3, 7 * -1])
        gradient_act[0] = np.sum(pred_error)
        gradient_act[1] += np.sum(pred_error * self.x[:, 0])
        gradient_act[2] += np.sum(pred_error * self.x[:, 1])

        self.assertTrue(abs(float(err_act) - error) < 1.0e-12)
        self.assertTrue(np.all(np.abs(gradient - gradient_act) < 1.0e-12))

        # weighted case
        x00 = np.array([self.x[0], [55, -2]])
        error_weighted, gradient_weighted = LogisticRegression._loss_gradient(
            x00, x1, self.b, self.w, self.lam,
            [np.array([0.4, 0.75]), np.array(0.35)])
        err_weighted_act = -np.log(
            LogisticRegression._sigmoid(x1, self.b, self.w)) * 0.35 - np.log(
                1.0 - LogisticRegression._sigmoid(x0, self.b, self.w)
            ) * 0.4 - np.log(1.0 - LogisticRegression._sigmoid(
                [x00[1, :]], self.b, self.w)) * 0.75 + 0.5 * 7 * 10
        self.assertTrue(
            abs(float(err_weighted_act) - error_weighted) < 1.0e-12)
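The hand-built constants follow from the regularized log-loss: x0 holds the t = 0 example and x1 the t = 1 example, and with lam = 7 and w = [3, -1] the 0.5 * 7 * 10 term is the L2 penalty 0.5 * lam * ||w||^2, while the gradient starts from (0, lam * w) = (0, 21, -7) before the data term is accumulated. A self-contained numpy sketch of that loss and gradient (loss_gradient below is a hypothetical helper, not mozsci's _loss_gradient, and it takes all points with 0/1 targets rather than the x0/x1 split):

import numpy as np

def loss_gradient(x, t, b, w, lam):
    """Regularized logistic loss and its gradient w.r.t. (b, w)."""
    p = 1.0 / (1.0 + np.exp(-(x.dot(w) + b)))          # sigmoid predictions
    loss = (-np.sum(t * np.log(p) + (1.0 - t) * np.log(1.0 - p))
            + 0.5 * lam * w.dot(w))                    # cross-entropy + L2
    err = p - t                                        # prediction error
    grad = np.concatenate(([np.sum(err)],              # d/db
                           x.T.dot(err) + lam * w))    # d/dw; penalty on w only
    return loss, grad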
Example #3
    def test_dragnet_model(self):
        params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
        block_model = LogisticRegression.load_model(params)
        mean_std = {
            'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
            'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]
        }
        koh_features = NormalizedFeature(kohlschuetter_features, mean_std)

        dm = ContentExtractionModel(Blockifier, [koh_features],
                                    block_model,
                                    threshold=0.5)
        content = dm.analyze(big_html_doc)

        # make prediction from individual components
        # to do so, we use kohlschuetter.make_features and LogisticRegression
        features, blocks = kohlschuetter.make_features(big_html_doc)
        nblocks = len(blocks)
        features_normalized = np.zeros(features.shape)
        for k in range(6):
            features_normalized[:, k] = (
                (features[:, k] - mean_std['mean'][k]) / mean_std['std'][k])
        blocks_keep_indices = np.arange(nblocks)[
            block_model.predict(features_normalized) > 0.5]

        actual_content = ' '.join(
            [blocks[index].text for index in blocks_keep_indices])

        # check that the tokens are the same!
        self.assertEqual(re.split(r'\s+', actual_content.strip()),
                         re.split(r'\s+', content.strip()))

        # check that we maintain backward compatibility
        from dragnet import DragnetModelKohlschuetterFeatures
        dmkf = DragnetModelKohlschuetterFeatures(block_model, mean_std)
        content_dmkf = dmkf.analyze(big_html_doc)
        self.assertEqual(re.split(r'\s+', actual_content.strip()),
                         re.split(r'\s+', content_dmkf.strip()))
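The per-column loop above is plain z-score normalization; with numpy broadcasting it collapses to a single expression (assuming, as here, one mean/std entry per feature column):

mean = np.asarray(mean_std['mean'])
std = np.asarray(mean_std['std'])
features_normalized = (features - mean) / std   # broadcasts over rows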
Example #4
    def test_map_train_model(self):
        # spec: [model class, error metric, model output file, args, kwargs]
        trainer = TrainModelCV(
            [LogisticRegression, classification_error, '/tmp/logistic.json',
             (), {'lam': 0.5}],
            X=self.X, y=self.y)
        errors = trainer.run()

        # load model
        trained_model = LogisticRegression.load_model('/tmp/logistic.json')
        loaded_model_error = classification_error(self.y, trained_model.predict(self.X))

        # check the errors
        key = list(errors.keys())[0]  # dict keys are not indexable in Python 3
        self.assertTrue(np.abs(errors[key]['train'] - 0.06) < 1e-12)
        self.assertTrue(np.abs(errors[key]['train'] - loaded_model_error) < 1e-12)
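The list passed to TrainModelCV reads as [model class, error metric, model output file, positional args, keyword args]. Written out with only the calls these examples already demonstrate, the core training step amounts to roughly the sketch below (it omits the file output and whatever cross-validation bookkeeping TrainModelCV itself performs):

model = LogisticRegression(lam=0.5)   # kwargs from the spec above
model.fit(X, y)                       # X, y: the training data
train_error = classification_error(y, model.predict(X))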
Example #5
    def test_sigmoid(self):
        y = LogisticRegression._sigmoid(self.x, self.b, self.w)
        yact = np.array([1.0 / (1.0 + np.exp(-6)), 1.0 / (1.0 + np.exp(-1.5))])

        self.assertTrue(np.all(np.abs(y - yact) < 1.0e-12))
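The expected values imply the two rows of self.x map to activations x.dot(w) + b of 6 and 1.5. Note that the naive 1.0 / (1.0 + np.exp(-z)) triggers overflow warnings in np.exp for large negative z; a numerically stable variant (a sketch, not mozsci's _sigmoid) splits on the sign:

import numpy as np

def sigmoid(z):
    """Numerically stable elementwise sigmoid for a numpy array z."""
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z)
    pos = z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))   # exponent <= 0, no overflow
    ez = np.exp(z[~pos])                       # z < 0 here, so ez <= 1
    out[~pos] = ez / (1.0 + ez)                # algebraically identical form
    return out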
Example #6
def train_models(datadir, output_prefix, features_to_use, lam=10):
    """Train a content extraction model.

    Does feature centering, trains the logistic regression model,
    pickles the final model and writes the train/test block level errors
    to a file

    datadir = root directory for all the data
    output_prefix = write the trained model files, errors, etc
        to files starting with this.
    features_to_use = a list of the features to use.  Must be one of the features
        known by AllFeatures
    lambda = lambda regularization parameter for LogisticRegression
    """
    import json
    import pickle
    import pprint

    import numpy as np

    from . import AllFeatures
    from .blocks import TagCountBlockifier as Blkr

    from mozsci.models import LogisticRegression
    from mozsci.numpy_util import NumpyEncoder

    # assemble the features
    feature_instances = []
    for f in features_to_use:
        feature_instances.append(AllFeatures.get(f))

    # compute the mean/std and save them
    data = DragnetModelData(datadir)
    trainer = DragnetModelTrainer()

    print "Initializing features"
    k = 0
    for f in feature_instances:
        # check to see if this feature needs to be init
        # if so, then init it, take the return object and serialize to json
        if hasattr(f, 'init_params'):
            # initialize it
            model_init = ContentExtractionModel(Blkr, [f], None)
            features, labels, weights = trainer.make_features_from_data(data, model_init, train=True)
            mean_std = f.init_params(features)
            f.set_params(mean_std)
            with open("%s_mean_std_%s.json" % (output_prefix, features_to_use[k]), 'w') as fout:
                fout.write("%s" % json.dumps(mean_std, cls=NumpyEncoder))
        k += 1

    model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

    # train the model
    print("Training the model")
    model = LogisticRegression(lam=lam)
    features, labels, weights = trainer.make_features_from_data(
        data, model_to_train, training_or_test='training')
    model.fit(features, labels, weights=np.minimum(weights, 200))  # cap weights at 200

    print "Checking errors"
    train_errors = accuracy_auc(labels, model.predict(features), weights=weights)

    # check errors on test set
    test_features, test_labels, test_weights = trainer.make_features_from_data(data,
                     model_to_train, training_or_test='test')
    test_weights = np.minimum(test_weights, 200.0)
    test_errors = accuracy_auc(test_labels, model.predict(test_features), weights=test_weights)

    # write errors to a file
    with open(output_prefix + '_block_errors.txt', 'w') as f:
        f.write("Training errors for final model (block level):\n")
        pprint.pprint(train_errors, f)
        f.write("Test errors (block level):\n")
        pprint.pprint(test_errors, f)

    # pickle the final model, using threshold = 0.5
    # (pickle requires a binary-mode file in Python 3)
    final_model = ContentExtractionModel(
        Blkr, feature_instances, model, threshold=0.5)
    with open(output_prefix + '_content_model.pickle', 'wb') as f:
        pickle.dump(final_model, f)

    print("done!")