def mean_model(features, solutions, verbose=0):
    columns = solutions.columns
    clf = DummyRegressor()
    print('Training Model... ')
    clf.fit(features, solutions)
    print('Done Training')
    return (clf, columns)
def test_regressor():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [5. / 4] * len(X))
def test_quantile_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        quantile_values, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)
def the_mocked_model():
    X, y = make_regression(n_samples=500, n_features=2)
    model = DummyRegressor()
    model.fit(X, y)
    return model
def dumb_regressor_result(x_test, x_train, y_test, y_train):
    """
    Dumb regressor that predicts only the mean value for each target variable;
    returns MAE and MSE metrics for each variable.

    Args:
        x_test: validation samples
        x_train: training samples
        y_test: validation target
        y_train: training target

    Returns:
        dumb_metrics: list of metrics results after dumb regression
    """
    dumb_reg = DummyRegressor()
    fake_data = np.zeros((x_train.shape[0], 1))
    fake_test = np.zeros((1, 1))
    dumb_reg.fit(fake_data, y_train)
    dumb_pred = dumb_reg.predict(fake_test)[0]
    dumb_metrics = []
    for i in range(dumb_pred.size):
        dumb_pred_var = np.full((x_test.shape[0], 1), dumb_pred[i])
        dumb_mse_var = mean_squared_error(y_test[:, i], dumb_pred_var)
        dumb_mae_var = mean_absolute_error(y_test[:, i], dumb_pred_var)
        dumb_metrics.append([dumb_mse_var, dumb_mae_var])
    return dumb_metrics
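# A minimal usage sketch for dumb_regressor_result (my own synthetic data, not from
# the original source; assumes numpy and the sklearn.metrics imports above are in scope):
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

rng = np.random.RandomState(0)
x_train, y_train = rng.randn(80, 4), rng.randn(80, 3)  # 3 target variables
x_test, y_test = rng.randn(20, 4), rng.randn(20, 3)

metrics = dumb_regressor_result(x_test, x_train, y_test, y_train)
for i, (mse, mae) in enumerate(metrics):
    print('target %d: MSE=%.3f, MAE=%.3f' % (i, mse, mae))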
def test_y_mean_attribute_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    # when strategy = 'mean'
    est = DummyRegressor(strategy='mean')
    est.fit(X, y)

    assert_equal(est.y_mean_, np.mean(y))
def test_weights_regressor():
    """Check weighted average regression prediction on boston dataset."""
    reg1 = DummyRegressor(strategy='mean')
    reg2 = DummyRegressor(strategy='median')
    reg3 = DummyRegressor(strategy='quantile', quantile=.2)
    ereg = VotingRegressor([('mean', reg1), ('median', reg2),
                            ('quantile', reg3)], weights=[1, 2, 10])

    X_r_train, X_r_test, y_r_train, y_r_test = \
        train_test_split(X_r, y_r, test_size=.25)

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]),
                     axis=0, weights=[1, 2, 10])
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
                                         ('quantile', reg3)], weights=None)
    ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
                                          ('quantile', reg3)],
                                         weights=[1, 1, 1])
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def train_classifier():
    X_train = tfv.transform(video_captions_train)
    X_test = tfv.transform(video_captions_test)

    dummy = DummyRegressor(strategy="median")
    dummy.fit(X_train, Y_train)
    Y_pred_med = dummy.predict(X_test)
def test_constants_not_specified_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy='constant')
    with pytest.raises(TypeError):
        est.fit(X, y)
def test_quantile_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        quantile_values, y_learn, y_pred_learn, y_test, y_pred_test
    )
    _check_behavior_2d(est)
def train_intelligence(dataframe, text_column, classification_column):
    vectorizer = TfidfVectorizer(lowercase=False)
    bag_of_words = vectorizer.fit_transform(dataframe[text_column])
    train, test, class_train, class_test = train_test_split(
        bag_of_words,
        dataframe[classification_column],
        random_state=42,
        test_size=0.25)

    logistic_regression = LogisticRegression()
    logistic_regression.fit(train, class_train)

    pesos = pd.DataFrame(logistic_regression.coef_[0].T,
                         index=vectorizer.get_feature_names())
    print(pesos.nlargest(10, 0))
    print(pesos.nsmallest(10, 0))

    filename = 'anton_brain.sav'
    pickle.dump(logistic_regression, open(filename, 'wb'))
    filename = 'anton_vectorizer.sav'
    pickle.dump(vectorizer, open(filename, 'wb'))

    ###### baseline ######
    baseline = DummyRegressor(strategy="mean")
    baseline.fit(train, class_train)
    print('Baseline Accuracy: ')
    # predict on the test features (constant mean prediction), not on the labels
    print(round(baseline.predict(test)[0] * 100, 2))
    ###### baseline ######

    print('Algorithm Accuracy:')
    print(round(logistic_regression.score(test, class_test) * 100, 2))
    return
def test_weights_regressor():
    """Check weighted average regression prediction on diabetes dataset."""
    reg1 = DummyRegressor(strategy="mean")
    reg2 = DummyRegressor(strategy="median")
    reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
    ereg = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
    )

    X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
        X_r, y_r, test_size=0.25
    )

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    avg = np.average(
        np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
    )
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
    )
    ereg_weights_equal = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
    )
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
class MeanRegressor(BaseEstimator):
    def __init__(self):
        """
        Model predicting the mean value of the target label
        """
        self.reg = DummyRegressor(strategy='mean')

    def fit(self, X, y):
        """
        Fits the model
        :param X: input dataframe
        :param y: labels dataframe
        :return:
        """
        X = X['user_id']
        self.reg.fit(X, y)

    def predict(self, X):
        """
        Predicts the values with given input data
        :param X: input dataframe
        :return: list of predicted values
        """
        X = X['user_id']
        return self.reg.predict(X)
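# A minimal usage sketch for MeanRegressor (hypothetical data of my own; the only
# structural assumption taken from the class above is the 'user_id' column):
import pandas as pd

X = pd.DataFrame({"user_id": [1, 2, 3, 4], "other": [10, 20, 30, 40]})
y = pd.Series([3.0, 5.0, 4.0, 8.0])

reg = MeanRegressor()
reg.fit(X, y)
print(reg.predict(X))  # four copies of y.mean() == 5.0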
def test_unknown_strategy_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy='gona')
    with pytest.raises(ValueError):
        est.fit(X, y)
class GradientBoosting:
    def __init__(self, n_estimators=100, learning_rate=0.1, subsamples=1.0,
                 max_features=None, max_depth=3, min_leaf_size=1, init=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsamples = subsamples
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_leaf_size = min_leaf_size
        if init is None:
            self.init_model = DummyRegressor()
        else:
            self.init_model = init
        self.estimators = []
        self.weights = []
        self.loss_by_iter = []

    def fit(self, X, y):
        if self.init_model == 'zeros':
            pred = np.zeros(y.shape)
        else:
            self.init_model.fit(X, y)
            pred = self.init_model.predict(X)
        res = pred.copy()
        self.loss_by_iter.append(mse(res, y))

        for i in tqdm(range(self.n_estimators)):
            grad = (y - res)
            cur_estimator = CART(max_depth=self.max_depth,
                                 min_leaf_size=self.min_leaf_size,
                                 max_features=self.max_features)
            if self.subsamples < 1.0:
                sample_ids = np.arange(y.shape[0])
                np.random.shuffle(sample_ids)
                sample_ids = sample_ids[:int(y.shape[0] * self.subsamples)]
                cur_estimator.fit(X[sample_ids], grad[sample_ids])
            else:
                cur_estimator.fit(X, grad)
            self.estimators.append(cur_estimator)
            pred = cur_estimator.predict(X)
            # b = golden_section(res, pred, y)
            b = 1
            self.weights.append(b)
            res += self.learning_rate * b * pred
            self.loss_by_iter.append(mse(res, y))

    def predict(self, X):
        pred = self.init_model.predict(X)
        for i in range(self.n_estimators):
            pred += self.learning_rate * self.weights[i] * self.estimators[i].predict(X)
        return pred
def test_dummy_regressor_on_nan_value():
    X = [[np.NaN]]
    y = [1]
    y_expected = [1]
    clf = DummyRegressor()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
def dummy_regressor(X, y, args={}):
    """
    Baseline model that makes trivial predictions, used for comparison
    against other models.
    """
    from sklearn.dummy import DummyRegressor
    clf = DummyRegressor(**args)
    clf.fit(X, y)
    return clf
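# A small usage sketch for dummy_regressor (synthetic data of my own; the kwargs
# passed through `args` are standard DummyRegressor parameters):
import numpy as np

rng = np.random.RandomState(0)
X, y = rng.randn(50, 3), rng.randn(50)

mean_clf = dummy_regressor(X, y)  # default 'mean' strategy
q_clf = dummy_regressor(X, y, args={"strategy": "quantile", "quantile": 0.9})
print(mean_clf.predict(X[:2]), q_clf.predict(X[:2]))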
def test_constants_not_specified_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy="constant")
    err_msg = "Constant target value has to be specified"
    with pytest.raises(TypeError, match=err_msg):
        est.fit(X, y)
def dummy_regressor(X_train, X_test, y_train, y_test):
    dummy_clf = DummyRegressor()
    dummy_clf.fit(X_train, y_train)
    dummy_pred = dummy_clf.predict(X_test)

    # Evaluate the root mean squared error
    calculated_error = np.sqrt(mean_squared_error(y_test, dummy_pred))

    return round(calculated_error, 3)
def test_y_mean_attribute_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    # when strategy = 'mean'
    est = DummyRegressor(strategy="mean")
    est.fit(X, y)

    assert est.constant_ == np.mean(y)
def test_dummy_regressor_on_3D_array():
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])
    y_expected = np.array([2, 2, 2])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred = cls.predict(X)
    assert_array_equal(y_pred, y_expected)
def test_constant_size_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)
    X = random_state.randn(10, 10)
    y = random_state.randn(10, 5)

    est = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    with pytest.raises(ValueError):
        est.fit(X, y)
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = DummyRegressor()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def execute(self):
    from sklearn.dummy import DummyRegressor

    dummy_regr = DummyRegressor(strategy="mean")
    dummy_regr.fit(self.partitions.x_train, self.partitions.y_train)
    y_pred = dummy_regr.predict(self.partitions.x_test)
    self.y_pred = y_pred
    return self.y_pred, self.partitions.y_test
def main():
    X_train = pd.read_csv("data/X_train.csv")
    y_train = pd.read_csv("data/y_train.csv")

    dummy_reg = DummyRegressor()
    dummy_reg.fit(X_train, y_train)

    pickle.dump(dummy_reg, open("models/dummy_reg.pkl", 'wb'))
def dummy_regressor_accuracy(x, y, evaluator: Callable, strategy: str = 'mean'):
    # strategy must be passed by keyword; DummyRegressor's constructor is keyword-only
    dummy = DummyRegressor(strategy=strategy)
    dummy.fit(x, y)
    y_hat = dummy.predict(x)
    print('DummyRegressor accuracy:', evaluator(y_hat, y))
    return dummy
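# A quick usage sketch for dummy_regressor_accuracy (my own toy data; note the function
# calls evaluator(y_hat, y), i.e. prediction first, so r2_score is wrapped to swap the order):
import numpy as np
from sklearn.metrics import r2_score

rng = np.random.RandomState(0)
x, y = rng.randn(100, 2), rng.randn(100)

# a mean baseline scores R^2 ~ 0 even on its own training data
dummy_regressor_accuracy(x, y, evaluator=lambda y_hat, y_true: r2_score(y_true, y_hat))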
def test_median_strategy_regressor():
    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""
    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             return_indicator=True,
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    estimator = dict([(name, sensible_regr)
                      for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf)
                      for name in CLF_SCORERS] +
                     [(name, sensible_ml_clf)
                      for name in MULTILABEL_ONLY_SCORERS])

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(estimator[name], X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert_not_equal(weighted, unweighted,
                             msg="scorer {0} behaves identically when "
                                 "called with sample weights: {1} vs "
                                 "{2}".format(name, weighted, unweighted))
            assert_almost_equal(weighted, ignored,
                                err_msg="scorer {0} behaves differently when "
                                        "ignoring samples and setting sample_weight to"
                                        " 0: {1} vs {2}".format(name, weighted, ignored))
        except TypeError as e:
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
def train_dummy_regressors(features, target):
    for strat in ['mean', 'median']:
        dr = DummyRegressor(strategy=strat)
        dr.fit(features, y=target.flatten())
        dummy_score = (100 * dr.score(features, target))
        print('{:.1f} % score for a dummy regressor using the {} strategy'.format(
            dummy_score, dr.get_params()['strategy']))
def test_constant_size_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)
    X = random_state.randn(10, 10)
    y = random_state.randn(10, 5)

    est = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    err_msg = r"Constant target value should have shape \(5, 1\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)
def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    estimator = dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS]
        + [(name, sensible_clf) for name in CLF_SCORERS]
        + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(
                estimator[name], X_test, target, sample_weight=sample_weight
            )
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert_not_equal(
                weighted,
                unweighted,
                msg="scorer {0} behaves identically when "
                "called with sample weights: {1} vs "
                "{2}".format(name, weighted, unweighted),
            )
            assert_almost_equal(
                weighted,
                ignored,
                err_msg="scorer {0} behaves differently when "
                "ignoring samples and setting sample_weight to"
                " 0: {1} vs {2}".format(name, weighted, ignored),
            )
        except TypeError as e:
            assert_true(
                "sample_weight" in str(e),
                "scorer {0} raises unhelpful exception when called "
                "with sample weights: {1}".format(name, str(e)),
            )
class DummyEstimator(BaseTesterEstimator):
    def __init__(self):
        self.regressor = DummyRegressor()

    def fit(self, x, y):
        self.regressor.fit(x, y)

    def predict(self, x):
        return self.regressor.predict(x)
def test_dummy_regressor_return_std():
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred_list = cls.predict(X, return_std=True)
    # there should be two elements when return_std is True
    assert len(y_pred_list) == 2
    # the second element should be all zeros
    assert_array_equal(y_pred_list[1], y_std_expected)
def test_quantile_invalid():
    X = [[0]] * 5  # ignored
    y = [0] * 5  # ignored

    est = DummyRegressor(strategy="quantile", quantile=None)
    err_msg = (
        "When using `strategy='quantile', you have to specify the desired quantile"
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)
def test_dummy_regressor_return_std():
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred_list = cls.predict(X, return_std=True)
    # there should be two elements when return_std is True
    assert_equal(len(y_pred_list), 2)
    # the second element should be all zeros
    assert_array_equal(y_pred_list[1], y_std_expected)
def train_average_predictor(journeys: DataFrame, last: str, target: str) -> Optional[Any]:
    journeys = journeys[pd.notnull(journeys[target]) & pd.notnull(journeys[last])]
    y = travel_times(journeys, [], last, target).astype("int64") / 1_000_000_000
    if len(y) > 0:
        predictor = DummyRegressor(strategy="median")
        predictor.fit(journeys, y)
        return predictor
    else:
        return None
def baseline():
    from sklearn.dummy import DummyRegressor

    baseline = DummyRegressor(strategy='mean')
    baseline.fit(X_train_scaled, y_train)
    y_pred_train = baseline.predict(X_train_scaled)
    # y_pred_train_round = np.round(y_pred_train)
    y_pred_test = baseline.predict(X_test_scaled)
    # y_pred_test_round = np.round(y_pred_test)
    print(r2_score(y_test, y_pred_test))
    # print(lm.score(X_test_scaled, y_test))
    # plot_conf_mat(y_test, y_pred_round)
    return scores_results(y_train, y_test, y_pred_train, y_pred_test)
def test_regressor_prediction_independent_of_X(strategy):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg1.fit(X1, y)
    predictions1 = reg1.predict(X1)

    X2 = [[1]] * 4
    reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg2.fit(X2, y)
    predictions2 = reg2.predict(X2)

    assert_array_equal(predictions1, predictions2)
def _make_estimators(X_train, y_train, y_ml_train):
    # Make estimators that make sense to test various scoring methods
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    return dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS] +
        [(name, sensible_clf) for name in CLF_SCORERS] +
        [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )
def test_constant_strategy_regressor():
    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="constant", constant=[43])
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    reg = DummyRegressor(strategy="constant", constant=43)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))
def test_multioutput_regressor():
    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    assert_array_equal(np.tile(mean, (y_learn.shape[0], 1)), y_pred_learn)
    assert_array_equal(np.tile(mean, (y_test.shape[0], 1)), y_pred_test)
    _check_behavior_2d(est)
def test_mean_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)
def _minimize_simbo_general(fun,
                            x0,  # only used to get number of features
                            args=(),
                            callback=None,
                            batch_size=100,
                            population_size=10000,
                            maxiter=10000,
                            scorer=None,  # if no scorer given, scores are constant
                            selector=None,  # only relevant if sampler is given
                            sampler=None):
    n_iter = int(maxiter / batch_size)
    assert n_iter > 0

    dummy_generator = generative_models.DummyGenerator(len(x0))
    if scorer is None:
        scorer = DummyRegressor()
    if sampler is None:
        sampler = dummy_generator
    if isinstance(selector, float) and 0 < selector < 1:
        selector = percentile_selector(selector)

    for i in range(n_iter):
        if i == 0:
            batch = dummy_generator.sample(batch_size)
        else:
            population = sampler.sample(population_size)
            scores = scorer.predict(population)
            batch_w_score = heapq.nsmallest(batch_size, zip(scores, population),
                                            key=lambda x: x[0])
            batch = [v for score, v in batch_w_score]
        results = optimize_utils.score_multi(fun, batch, args, callback)
        selected = selector(results, batch) if selector is not None else batch
        scorer.fit(batch, results)
        sampler.fit(selected)

    best_fval, best_x = max(zip(results, batch), key=lambda x: x[0])
    nfev = batch_size * n_iter
    return optimize_utils.to_result(x=best_x, fun=best_fval, niter=n_iter, nfev=nfev)
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""
    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier()
    sensible_clf.fit(X_train, y_train)
    estimator = dict([(name, sensible_regr)
                      for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf)
                      for name in CLF_SCORERS])

    for name, scorer in SCORERS.items():
        try:
            weighted = scorer(estimator[name], X_test, y_test,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], y_test[10:])
            unweighted = scorer(estimator[name], X_test, y_test)
            assert_not_equal(weighted, unweighted,
                             "scorer {0} behaves identically when called with "
                             "sample weights: {1} vs {2}".format(name, weighted,
                                                                 unweighted))
            assert_equal(weighted, ignored,
                         "scorer {0} behaves differently when ignoring "
                         "samples and setting sample_weight to 0: "
                         "{1} vs {2}".format(name, weighted, ignored))
        except TypeError as e:
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
def test_constant_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(constants, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)
def simplest(cube, y, cv):
    """ just use the mean to impute the missing values """
    from sklearn.dummy import DummyRegressor
    clf = DummyRegressor()
    X = cube.reshape(cube.shape[0], cube.shape[1] * cube.shape[2])
    sse = np.zeros(y.shape[1])
    for train, test in cv:
        y_train, y_test = y[train], y[test]
        y_predict = clf.fit(X[train], y[train]).predict(X[test])
        sse += np.mean((y_predict - y_test) ** 2, 0)
    return sse
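# A small usage sketch for simplest (my own synthetic data; `cv` is assumed to be an
# iterable of (train, test) index pairs, here built with sklearn's KFold):
import numpy as np
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
cube = rng.randn(30, 4, 5)   # 30 samples, 4 x 5 feature grid
y = rng.randn(30, 3)         # 3 targets per sample
cv = list(KFold(n_splits=5).split(cube))

print(simplest(cube, y, cv))  # per-target MSE summed over folds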
def test_quantile_strategy_regressor():
    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="quantile", quantile=0.5)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.min(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=1)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.max(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0.3)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))
def test_stacked_featurizer(self):
    data = self.make_test_data()
    data['y'] = [1, 2, 3]

    # Test for a regressor
    model = DummyRegressor()
    model.fit(self.multi.featurize_many(data['x']), data['y'])

    # Test the predictions
    f = StackedFeaturizer(self.single, model)
    self.assertEqual([2], f.featurize(data['x'][0]))

    # Test the feature names
    self.assertEqual(['prediction'], f.feature_labels())
    f.name = 'ML'
    self.assertEqual(['ML prediction'], f.feature_labels())

    # Test classifier
    model = DummyClassifier("prior")
    data['y'] = [0, 0, 1]
    model.fit(self.multi.featurize_many(data['x']), data['y'])

    # Test the prediction
    f.model = model
    self.assertEqual([2. / 3], f.featurize(data['x'][0]))

    # Test the feature labels
    self.assertRaises(ValueError, f.feature_labels)
    f.class_names = ['A', 'B']
    self.assertEqual(['ML P(A)'], f.feature_labels())

    # Test with three classes
    data['y'] = [0, 2, 1]
    model.fit(self.multi.featurize_many(data['x']), data['y'])

    self.assertArrayAlmostEqual([1. / 3] * 2, f.featurize(data['x'][0]))
    f.class_names = ['A', 'B', 'C']
    self.assertEqual(['ML P(A)', 'ML P(B)'], f.feature_labels())
def main():
    # read review data
    print('parsing review data...')
    reviews = parse_json('./yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

    # use only reviews posted after 2008
    valid_reviews = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        if review_date.year < 2008:
            continue
        valid_reviews.append(review)
    reviews = valid_reviews

    # sample the data
    # sample_num = len(reviews)
    # print('sampling...', sample_num, 'out of', len(reviews))
    # reviews = sample(reviews, sample_num)

    # tokenize text for all reviews
    print('tokenizing text for all reviews...')
    texts = [review['text'] for review in reviews]
    count_vect = CountVectorizer(max_features=100)
    X = count_vect.fit_transform(texts)

    # transform from occurrence to frequency
    print('converting occurrence to frequency...')
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X)

    # load the linear model for normalization
    clf = joblib.load('./normalization/linear_model_for_normalization.pkl')

    # get labels
    print('calculating labels...')
    y = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        # normalize
        normalizor = clf.predict(np.array([[review_date.year]]))[0][0]
        review_quality = sum(review['votes'].values()) / normalizor
        y.append(review_quality)

    # splitting into train and test set
    print('splitting into train and test set...')
    train_len = int(X.shape[0] * 0.6)
    X_train = X[:train_len, :]
    y_train = y[:train_len]
    X_test = X[train_len:, :]
    y_test = y[train_len:]
    print('train size:', X_train.shape)
    print('test size:', X_test.shape)

    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train.toarray())
    # X_test = poly.fit_transform(X_test.toarray())
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression:')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Ridge
    print('\nRidge: ')
    model = Ridge()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Passive Aggressive
    print('\nPassive Aggressive: ')
    model = PassiveAggressiveRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
def main():
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # The result turns out to be worse using non-linear polynomial regression
    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train)
    # X_test = poly.fit_transform(X_test)
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression: (baseline)')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # KNN Regression
    # print('\nKNN Regression: ')
    # model = KNeighborsRegressor()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # Neural Network - Bernoulli Restricted Boltzmann Machine (RBM)
    # print('\nNeural Network - RBM: ')
    # model = BernoulliRBM()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert_equal(reg.score(None, y_test), 1.0)
    # body of the cross-validation fold loop (the loop header is not part of this fragment)
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = y[train]
        y_test = y[test]

        X_train = np.hstack([X_fmri_train, X_meg_train])
        X_test = np.hstack([X_fmri_test, X_meg_test])

        pls.fit(X_train, y_train)
        pred = pls.predict(X_test)
        mae += mean_absolute_error(y_test, pred)

        dumb.fit(X_train, y_train)
        dumb_pred = dumb.predict(X_test)
        dumb_mae += mean_absolute_error(y_test, dumb_pred)

        if within:
            pls.fit(X_fmri_train, y_train)
            pred = pls.predict(X_fmri_test)
            fmri_mae += mean_absolute_error(y_test, pred)

            pls.fit(X_meg_train, y_train)
            pred = pls.predict(X_meg_test)
            meg_mae += mean_absolute_error(y_test, pred)

    comp_scores.append(mae / nfolds)
    dumb_scores.append(dumb_mae / nfolds)
    fmri_scores.append(fmri_mae / nfolds)
    # inner part of the loop over candidate component counts (outer loop header not shown)
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')
    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]

        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)
        mae += mean_absolute_error(X_meg_test, pred)

        dumb.fit(X_fmri_train, X_meg_train)
        dumb_pred = dumb.predict(X_fmri_test)
        dumb_mae += mean_absolute_error(X_meg_test, dumb_pred)

    comp_scores.append(mae / nfolds)
    dumb_scores.append(dumb_mae / nfolds)

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

plt.plot(max_comps, comp_scores, max_comps, dumb_scores)
t_str = seed + str(band)
plt.title(t_str)
plt.savefig(home + '/tmp/meg_fmri_%s_%s.png' % (seed, band[0]))