def build_target(self):
    """Build the target series and cache positive/negative index partitions.

    Sets ``self.y`` plus ``positives``/``negatives`` index splits (based on
    the target cast to bool) and their counts.
    """
    target, _fitted = build_target_safe(self.target, self.data)
    self.y = target
    as_bool = target.astype('bool')  # truthiness mask, computed once
    self.negatives = target[~as_bool].index
    self.positives = target[as_bool].index
    self.n_positives = len(self.positives)
    self.n_negatives = len(self.negatives)
def cross_validate(model_def, data, folds, repeat=1):
    """Cross-validate ``model_def`` on ``data`` over the given folds.

    ``folds`` is either an int (expanded via ``make_default_folds``) or an
    iterable of ``(train, test)`` or ``(train, test, prep)`` index tuples.
    Each fold is fitted, scored, and wrapped in a ``Result``; the whole
    procedure runs ``repeat`` times. Returns the list of results.
    """
    if isinstance(folds, int):
        folds = make_default_folds(num_folds=folds, data=data)
    results = []
    for _ in range(repeat):
        for fold in folds:
            n_parts = len(fold)
            if n_parts == 2:
                train_index, test_index = fold
                prep_index = None
            elif n_parts == 3:
                train_index, test_index, prep_index = fold
            else:
                raise ValueError("Fold is not of right dimension (%d, not 2 or 3)" % n_parts)
            # Sanity: a row must never appear in both train and test.
            assert len(train_index & test_index) == 0, "train and test overlap!!! %s, %s" % (train_index, test_index)
            x_train, y_train, fitted_model = fit_model(model_def, data,
                                                       prep_index, train_index)
            test_data = data.loc[test_index]
            x_test, y_test = generate_test(model_def, test_data, fitted_model)
            assert len(x_train.index & x_test.index) == 0, "train and test overlap!!! %s" % (x_train.index & x_test.index)
            y_preds = predict(fitted_model, x_test)
            if model_def.evaluation_target is not None:
                # Score against a dedicated evaluation target when configured.
                y_test, _ = build_target_safe(model_def.evaluation_target,
                                              test_data)
            results.append(Result(x_train, x_test, y_train, y_test, y_preds,
                                  model_def, fitted_model, data))
    return results
def generate_train(model_def, data, prep_index=None, train_index=None):
    """Build the training featureset and target, restricted to ``train_index``.

    Filters data/indexes first, then returns
    ``(x_train, y_train, fitted_features, fitted_target)``.
    """
    data, prep_index, train_index = filter_data_and_indexes(
        model_def, data, prep_index, train_index)
    feature_frame, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index)
    target_series, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index)
    # Align both outputs to exactly the training rows.
    feature_frame = feature_frame.reindex(train_index)
    target_series = target_series.reindex(train_index)
    return feature_frame, target_series, fitted_features, fitted_target
def predict(fitted_model, x_data):
    """Predict on ``x_data`` with a fitted model, optionally post-transforming.

    Raw estimator predictions are wrapped in a Series indexed like ``x_data``.
    If the model definition carries an ``evaluation_transformation``, the
    predictions are temporarily attached to ``x_data`` under
    ``predictions_name`` so the transformation can read them, and the column
    is removed afterwards.

    Returns the (possibly transformed) prediction Series.
    """
    model_def = fitted_model.model_def
    predictions = fitted_model.fitted_estimator.predict(x_data)
    predictions = pd.Series(predictions, index=x_data.index)
    if model_def.evaluation_transformation is not None:
        x_data[model_def.predictions_name] = predictions
        try:
            predictions, _ = build_target_safe(
                model_def.evaluation_transformation, x_data)
        finally:
            # Fix: previously a failing transformation leaked the temporary
            # prediction column into the caller's frame.
            del x_data[model_def.predictions_name]
    return predictions
def _train(self, train_datas):
    """Select feature columns from the concatenated training frames.

    Concatenates ``train_datas`` column-wise, aligns the target to the
    result, and delegates selection to ``self.selector`` using
    ``threshold_arg`` (or ``n_keep`` when unset).
    """
    merged = concat(train_datas, axis=1)
    target, _fitted = build_target_safe(self.target, self.data)
    target = reindex_safe(target, merged.index)
    selector_arg = self.n_keep if self.threshold_arg is None else self.threshold_arg
    return self.selector.select(merged, target, selector_arg)
def _train(self, train_datas):
    """Run column selection over the joined training frames.

    Builds and aligns the target, then asks ``self.selector`` for the
    surviving columns; ``threshold_arg`` takes precedence over ``n_keep``.
    """
    joined = concat(train_datas, axis=1)
    y_series, _ff = build_target_safe(self.target, self.data)
    y_series = reindex_safe(y_series, joined.index)
    threshold = self.threshold_arg
    if threshold is None:
        threshold = self.n_keep
    selected = self.selector.select(joined, y_series, threshold)
    return selected
def predict(fitted_model, x_data):
    """Return model predictions for ``x_data`` as a Series.

    When ``model_def.evaluation_transformation`` is set, the raw predictions
    are exposed on ``x_data`` under ``predictions_name`` while the
    transformation runs, then the column is removed — even on error.
    """
    model_def = fitted_model.model_def
    predictions = fitted_model.fitted_estimator.predict(x_data)
    predictions = pd.Series(predictions, index=x_data.index)
    if model_def.evaluation_transformation is not None:
        x_data[model_def.predictions_name] = predictions
        try:
            predictions, _ = build_target_safe(
                model_def.evaluation_transformation, x_data)
        finally:
            # Fix: guarantee the temporary column never survives an exception
            # raised by the transformation.
            del x_data[model_def.predictions_name]
    return predictions
def generate_train(model_def, data, prep_index=None, train_index=None):
    """Produce the aligned training set for ``model_def``.

    Returns ``(x_train, y_train, fitted_features, fitted_target)`` with the
    feature matrix and target reindexed to ``train_index``.
    """
    data, prep_index, train_index = filter_data_and_indexes(
        model_def, data, prep_index, train_index)
    x_raw, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index)
    y_raw, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index)
    return (x_raw.reindex(train_index),
            y_raw.reindex(train_index),
            fitted_features,
            fitted_target)
def _train(self, train_data):
    """Compute per-group target aggregates for sufficiently large groups.

    Groups of ``self.group_by`` with fewer than ``self.min_sample`` rows are
    collapsed into a single '__other' bucket before aggregating the target
    with ``self.func``. Temporary helper columns are removed from
    ``train_data`` before returning.

    Returns ``(keys, vals)``: the retained group labels and a dict mapping
    each label (plus possibly '__other') to its aggregated target value.
    """
    y, _fitted = build_target_safe(self.target, train_data)
    counts = train_data[self.group_by].value_counts()
    # Fix: Series.iterkv() was removed from pandas; items() is the
    # supported equivalent.
    keys = [k for k, v in counts.items() if v >= self.min_sample]
    key_set = set(keys)  # O(1) membership test inside the map below
    train_data['__grouping'] = train_data[self.group_by].map(
        lambda x: x if x in key_set else '__other')
    train_data['__target'] = y
    vals = train_data.groupby('__grouping').agg(
        {'__target': self.func})['__target'].to_dict()
    logging.debug("Preparing Target Aggregations:")
    # Fix: dict.items() is a view on Python 3 and cannot be sliced directly.
    logging.debug(str(list(vals.items())[:10]))
    del train_data['__target']
    del train_data['__grouping']
    return (keys, vals)
def fit_model(model_def, data, prep_index=None, train_index=None):
    """Build the training set, fit the estimator, and wrap everything up.

    Returns ``(x_train, y_train, fitted_model)`` where ``fitted_model``
    bundles the fitted features, target, and estimator.
    """
    features, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index)
    target, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index)
    # Fit the underlying estimator on the prepared training set.
    model_def.estimator.fit(features, target)
    estimator_wrapper = FittedEstimator(model_def.estimator, features, target)
    fitted_model = FittedModel(model_def, fitted_features, fitted_target,
                               estimator_wrapper)
    return features, target, fitted_model
def _train(self, train_data):
    """Compute per-group target aggregates, optionally regularized.

    With ``self.regularize`` set, each group's aggregate is shrunk toward the
    global aggregate, weighting the prior by ``self.min_sample``
    pseudo-observations. Otherwise groups smaller than ``self.min_sample``
    are pooled into '__other' (which falls back to the global value when no
    row lands there). Temporary columns are removed from ``train_data``.

    Returns a dict mapping group label to aggregated target value.
    """
    y, _fitted = build_target_safe(self.target, train_data)
    train_data['__target'] = y
    global_value = self.func(y)
    if self.regularize:
        # Shrinkage toward the global value; min_sample acts as the prior
        # weight. (Removed an unused `keys = ...unique()` computation here.)
        f = lambda x: (self.func(x) * x.size + global_value * self.min_sample) / (x.size + self.min_sample)
        vals = train_data.groupby(self.group_by).agg(
            {'__target': f})['__target'].to_dict()
    else:
        vc = train_data[self.group_by].value_counts()
        # Fix: Series.iterkv() was removed from pandas; items() is the
        # supported spelling. A set gives O(1) membership in the map below.
        frequent = {k for k, v in vc.items() if v >= self.min_sample}
        train_data['__grouping'] = train_data[self.group_by].map(
            lambda x: x if x in frequent else '__other')
        vals = train_data.groupby('__grouping').agg(
            {'__target': self.func})['__target'].to_dict()
        del train_data['__grouping']
        if '__other' not in vals:
            vals['__other'] = global_value
    logging.debug("Preparing Target Aggregations:")
    # Fix: dict.items() is a view on Python 3 and cannot be sliced directly.
    logging.debug(str(list(vals.items())[:10]))
    del train_data['__target']
    return vals
def cross_validate(model_def, data, folds, repeat=1):
    """Repeatedly cross-validate ``model_def`` over ``folds``.

    ``folds`` may be an int (expanded via ``make_default_folds``) or an
    iterable of ``(train, test)`` / ``(train, test, prep)`` index tuples.
    Returns one ``Result`` per fold per repeat.
    """
    if isinstance(folds, int):
        folds = make_default_folds(num_folds=folds, data=data)

    def _run_fold(fold):
        # Unpack the fold; prep_index is optional.
        if len(fold) == 2:
            train_index, test_index = fold
            prep_index = None
        elif len(fold) == 3:
            train_index, test_index, prep_index = fold
        else:
            raise ValueError(
                "Fold is not of right dimension (%d, not 2 or 3)" % len(fold))
        # A row must never be in both train and test.
        assert len(train_index & test_index) == 0, "train and test overlap!!! %s, %s" % (train_index, test_index)
        x_train, y_train, fitted_model = fit_model(model_def, data,
                                                   prep_index, train_index)
        test_data = data.loc[test_index]
        x_test, y_test = generate_test(model_def, test_data, fitted_model)
        assert len(x_train.index & x_test.index) == 0, "train and test overlap!!! %s" % (x_train.index & x_test.index)
        y_preds = predict(fitted_model, x_test)
        if model_def.evaluation_target is not None:
            # Evaluate against a dedicated target when one is configured.
            y_test, _ = build_target_safe(model_def.evaluation_target,
                                          test_data)
        return Result(x_train, x_test, y_train, y_test, y_preds,
                      model_def, fitted_model, data)

    return [_run_fold(fold) for _ in range(repeat) for fold in folds]
def _train(self, train_data):
    """Build the group -> aggregated-target mapping, with optional shrinkage.

    Regularized mode blends each group's ``self.func`` aggregate with the
    global aggregate (prior weight ``self.min_sample``). Non-regularized
    mode pools rare groups (< ``min_sample`` rows) into '__other', defaulting
    that bucket to the global value if empty. Helper columns added to
    ``train_data`` are removed before returning.

    Returns a dict of group label -> aggregated target value.
    """
    y, _fitted = build_target_safe(self.target, train_data)
    train_data['__target'] = y
    global_value = self.func(y)
    if self.regularize:
        # Weighted blend of group aggregate and global prior.
        # (Dropped the unused `keys = ...unique()` assignment.)
        f = lambda x: (self.func(x) * x.size + global_value * self.min_sample) / (x.size + self.min_sample)
        vals = train_data.groupby(self.group_by).agg(
            {'__target': f})['__target'].to_dict()
    else:
        vc = train_data[self.group_by].value_counts()
        # Fix: Series.iterkv() no longer exists in pandas; use items().
        # A set makes the per-row membership test O(1).
        frequent = {k for k, v in vc.items() if v >= self.min_sample}
        train_data['__grouping'] = train_data[self.group_by].map(
            lambda x: x if x in frequent else '__other')
        vals = train_data.groupby('__grouping').agg(
            {'__target': self.func})['__target'].to_dict()
        del train_data['__grouping']
        if '__other' not in vals:
            vals['__other'] = global_value
    logging.debug("Preparing Target Aggregations:")
    # Fix: dict.items() is a non-sliceable view on Python 3; materialize it.
    logging.debug(str(list(vals.items())[:10]))
    del train_data['__target']
    return vals
def generate_train(model_def, data, prep_index=None, train_index=None):
    """Build the training featureset and target for ``model_def``.

    Returns ``(x_train, y_train, fitted_features, fitted_target)`` without
    any additional filtering or reindexing.
    """
    features, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index)
    target, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index)
    return features, target, fitted_features, fitted_target