def train_model(self, data, model_library, model):
    """Train the models in ``model_library`` on features built from ``data``.

    data: an instance of DragnetModelData.
    model_library: a list of model definitions, passed to run_train_models.
    model: provides model.make_features to build the block-level features.

    Returns (errors, features, labels, weights, folds).
    """
    from mozsci.map_train import run_train_models

    # Features + labels are block level; build them once for training.
    X, y, w = self.make_features_from_data(data, model)

    # Cap weights so that no single block dominates the fit.
    w = np.minimum(w, 200)

    # K-fold cross-validation splits (fixed seed for reproducibility).
    folds = cv_kfold(len(y), self.kfolds, seed=2)

    # Pass sample weights through only when weighted training is enabled.
    train_kwargs = {
        'processes': 4,
        'model_library': model_library,
        'X': X,
        'y': y,
        'folds': folds,
    }
    if self.weighted:
        train_kwargs['weights'] = w
    errors = run_train_models(**train_kwargs)

    return errors, X, y, w, folds
def train_model(self, data, model_library, features_to_use):
    """Train the block models on ``data`` with the named features.

    data: an instance of DragnetModelData.
    model_library: the block_models to train, a list of model definitions
        as input to run_train_models.
    features_to_use: a list of feature names; each must be one of the
        features known by AllFeatures.

    Returns (errors, features, labels, weights, folds).
    """
    from . import AllFeatures
    from .blocks import TagCountReadabilityBlockifier as Blkr
    from mozsci.map_train import run_train_models

    # Look up each requested feature object by name.
    feature_instances = [AllFeatures.get(name) for name in features_to_use]

    # Feature centering: any feature exposing init_params gets its
    # centering parameters computed from the training data, then set back
    # on the feature before the real model is assembled.
    print("Initializing features")
    for feat in feature_instances:
        if not hasattr(feat, 'init_params'):
            continue
        init_model = ContentExtractionModel(Blkr, [feat], None)
        X, y, w = self.make_features_from_data(
            data, init_model, train=True)
        feat.set_params(feat.init_params(X))

    model_to_train = ContentExtractionModel(Blkr, feature_instances, None)

    # Build the final training features with the centered feature set.
    print("Training the model")
    features, labels, weights = self.make_features_from_data(
        data, model_to_train, training_or_test='training')

    # Cap weights so that no single block dominates the fit.
    weights = np.minimum(weights, 200)

    # K-fold cross-validation only when more than one fold was requested.
    if self.kfolds > 1:
        folds = cv_kfold(len(labels), self.kfolds, seed=2)
    else:
        folds = None

    # Pass sample weights through only when weighted training is enabled.
    train_kwargs = {
        'processes': 1,
        'model_library': model_library,
        'X': features,
        'y': labels,
        'folds': folds,
    }
    if self.weighted:
        train_kwargs['weights'] = weights
    errors = run_train_models(**train_kwargs)

    return errors, features, labels, weights, folds
def test_run_train_models(self):
    """run_train_models trains every model in the library and returns a
    dict of errors keyed by a string describing the model and its params;
    the lam=0.5 logistic regression should hit a known training error."""
    model_library = [
        [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
        [LogisticRegression, classification_error, None, (), {'lam': 50}],
    ]
    errors = run_train_models(2, model_library, X=self.X, y=self.y)
    for k in errors:
        # Exact substring match instead of re.search: the old pattern
        # "{'lam': 0.5}" left '.' unescaped, so it was "any character"
        # and could spuriously match other keys (e.g. "{'lam': 015}").
        if "{'lam': 0.5}" in k:
            err_check = errors[k]
            self.assertTrue(abs(err_check['train'] - 0.06) < 1e-8)
def test_run_train_models(self):
    """run_train_models trains every model in the library and returns a
    dict of errors keyed by a string describing the model and its params;
    the lam=0.5 logistic regression should hit a known training error."""
    model_library = [
        [LogisticRegression, classification_error, None, (), {'lam': 0.5}],
        [LogisticRegression, classification_error, None, (), {'lam': 50}],
    ]
    errors = run_train_models(2, model_library, X=self.X, y=self.y)
    for k in errors:
        # Exact substring match instead of re.search: the old pattern
        # "{'lam': 0.5}" left '.' unescaped, so it was "any character"
        # and could spuriously match other keys (e.g. "{'lam': 015}").
        if "{'lam': 0.5}" in k:
            err_check = errors[k]
            self.assertTrue(abs(err_check['train'] - 0.06) < 1e-8)