Example #1
0
    def train_model(self, data, model_library, model):
        """Train the models in *model_library* on block-level features.

        data is an instance of DragnetModelData
        model_library is a list of model definitions as input to
         run_train_models
        model provides model.make_features to make the features
        """
        from mozsci.map_train import run_train_models

        # Features and labels are block level; extract them all at once
        # from the data using the provided model's feature maker.
        feats, labels, weights = self.make_features_from_data(data, model)

        # Cap the sample weights so no single block dominates training.
        weights = np.minimum(weights, 200)

        # Build the k-fold cross-validation splits (fixed seed so the
        # partition is reproducible across runs).
        folds = cv_kfold(len(labels), self.kfolds, seed=2)

        # Forward the sample weights only when weighted training is on.
        extra = {'weights': weights} if self.weighted else {}
        errors = run_train_models(processes=4, model_library=model_library,
            X=feats, y=labels, folds=folds, **extra)

        return errors, feats, labels, weights, folds
Example #2
0
    def train_model(self, data, model_library, features_to_use):
        """Train the block models on *data* with the named features.

        data is an instance of DragnetModelData
        model_library: the block_models to train as a list of model
            definitions as input to run_train_models
        features_to_use = a list of the features to use.  Must be one of
            the features known by AllFeatures
        """
        from . import AllFeatures
        from .blocks import TagCountReadabilityBlockifier as Blkr

        from mozsci.map_train import run_train_models

        # Look up each requested feature instance by name.
        feature_instances = [AllFeatures.get(name) for name in features_to_use]

        # Feature centering: any feature exposing init_params needs its
        # parameters (mean/std) computed from the training data first.
        print("Initializing features")
        for feat in feature_instances:
            if not hasattr(feat, 'init_params'):
                continue
            # Build a one-feature model, extract its training features,
            # then hand the computed parameters back to the feature.
            single_feature_model = ContentExtractionModel(Blkr, [feat], None)
            features, labels, weights = self.make_features_from_data(
                data, single_feature_model, train=True)
            feat.set_params(feat.init_params(features))

        model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

        # Extract the full training matrix with every feature attached.
        print("Training the model")
        features, labels, weights = self.make_features_from_data(
            data, model_to_train, training_or_test='training')

        # Cap the sample weights so no single block dominates training.
        weights = np.minimum(weights, 200)

        # Build k-fold CV splits only when more than one fold is requested.
        if self.kfolds > 1:
            folds = cv_kfold(len(labels), self.kfolds, seed=2)
        else:
            folds = None

        # Forward the sample weights only when weighted training is on.
        extra = {'weights': weights} if self.weighted else {}
        errors = run_train_models(
            processes=1, model_library=model_library,
            X=features, y=labels, folds=folds, **extra)

        return errors, features, labels, weights, folds
Example #3
0
    def train_model(self, data, model_library, features_to_use):
        """Assemble the requested features, center them, and train.

        data is an instance of DragnetModelData
        model_library: the block_models to train as a list of model
            definitions as input to run_train_models
        features_to_use = a list of the features to use.  Must be one of
            the features known by AllFeatures
        """
        from . import AllFeatures
        from .blocks import TagCountReadabilityBlockifier as Blkr

        from mozsci.map_train import run_train_models

        # Resolve the feature names into feature instances.
        feature_instances = []
        for name in features_to_use:
            feature_instances.append(AllFeatures.get(name))

        # Initialize (center) every feature that supports it: compute its
        # parameters from the training data and store them on the feature.
        print("Initializing features")
        for feature in feature_instances:
            if hasattr(feature, 'init_params'):
                init_model = ContentExtractionModel(Blkr, [feature], None)
                feats, labels, weights = self.make_features_from_data(
                    data, init_model, train=True)
                params = feature.init_params(feats)
                feature.set_params(params)

        model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

        # Pull the complete block-level feature matrix for training.
        print("Training the model")
        feats, labels, weights = self.make_features_from_data(
            data, model_to_train, training_or_test='training')

        # Cap sample weights at 200 so no block dominates the objective.
        weights = np.minimum(weights, 200)

        # k-fold cross-validation only makes sense for kfolds > 1.
        folds = cv_kfold(len(labels), self.kfolds, seed=2) if self.kfolds > 1 else None

        if self.weighted:
            errors = run_train_models(
                processes=1, model_library=model_library,
                X=feats, y=labels, folds=folds, weights=weights)
        else:
            errors = run_train_models(
                processes=1, model_library=model_library,
                X=feats, y=labels, folds=folds)

        return errors, feats, labels, weights, folds
Example #4
0
    def test_run_train_models(self):
        """Train two regularized models; check the lam=0.5 train error.

        Fixes two defects in the lookup of the lam=0.5 result:
        - re.search on the unescaped pattern "{'lam': 0.5}" let the '.'
          wildcard match any character; a plain substring test is exact.
        - err_check was left unbound (NameError) when no key matched;
          now that case fails with a clear assertion message instead.
        """
        model_library = [
            [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
            [LogisticRegression, classification_error, None, (), {'lam': 50}],
        ]

        errors = run_train_models(2, model_library, X=self.X, y=self.y)

        # Locate the result keyed by the lam=0.5 model definition.
        err_check = None
        for key in errors:
            if "{'lam': 0.5}" in key:
                err_check = errors[key]
        self.assertIsNotNone(err_check, "no result found for the lam=0.5 model")

        self.assertTrue(abs(err_check['train'] - 0.06) < 1e-8)
Example #5
0
    def test_run_train_models(self):
        """Train two regularized models; check the lam=0.5 train error.

        Fixes two defects in the lookup of the lam=0.5 result:
        - re.search on the unescaped pattern "{'lam': 0.5}" let the '.'
          wildcard match any character; a plain substring test is exact.
        - err_check was left unbound (NameError) when no key matched;
          now that case fails with a clear assertion message instead.
        """
        model_library = [[
            LogisticRegression, classification_error, None, (), {
                'lam': 0.5
            }
        ], [LogisticRegression, classification_error, None, (), {
            'lam': 50
        }]]

        errors = run_train_models(2, model_library, X=self.X, y=self.y)

        # Locate the result keyed by the lam=0.5 model definition.
        err_check = None
        for key in errors:
            if "{'lam': 0.5}" in key:
                err_check = errors[key]
        self.assertIsNotNone(err_check, "no result found for the lam=0.5 model")

        self.assertTrue(abs(err_check['train'] - 0.06) < 1e-8)