示例#1
0
def mean_model(features, solutions, verbose=0):
    """Fit a DummyRegressor baseline and return it with the solution columns."""
    target_columns = solutions.columns
    model = DummyRegressor()
    print('Training Model... ')
    model.fit(features, solutions)
    print('Done Training')
    return (model, target_columns)
示例#2
0
def test_regressor():
    # DummyRegressor ignores the features and predicts the mean of y.
    features = [[0]] * 4  # ignored
    targets = [1, 2, 1, 1]

    model = DummyRegressor()
    model.fit(features, targets)
    expected = [5. / 4] * len(features)
    assert_array_equal(model.predict(features), expected)
示例#3
0
def test_quantile_strategy_multioutput_regressor():
    """Quantile strategy reproduces per-column percentiles on 2d targets."""
    rng = np.random.RandomState(seed=1)

    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Check the median (q=0.5) first, then the 80th percentile (q=0.8).
    for q, expected in ((0.5, median), (0.8, quantile_values)):
        est = DummyRegressor(strategy="quantile", quantile=q)
        est.fit(X_learn, y_learn)
        _check_equality_regressor(
            expected, y_learn, est.predict(X_learn), y_test,
            est.predict(X_test))
        _check_behavior_2d(est)
示例#4
0
def the_mocked_model():
    """Return a DummyRegressor fitted on a synthetic regression problem."""
    features, targets = make_regression(n_samples=500, n_features=2)
    mocked = DummyRegressor()
    mocked.fit(features, targets)
    return mocked
def dumb_regressor_result(x_test, x_train, y_test, y_train):
    """
    Dumb regressor, predict only the mean value for each target variable,
    returns MAE and MSE metrics per each variable.

    Args:
      x_test: validation samples
      x_train: training samples
      y_test: validation target
      y_train: training target

    Returns:
      dumb_metrics: list of metrics results after dumb regression
    """
    baseline = DummyRegressor()
    # DummyRegressor never looks at the features, so zeros stand in for them.
    baseline.fit(np.zeros((x_train.shape[0], 1)), y_train)
    mean_prediction = baseline.predict(np.zeros((1, 1)))[0]
    metrics = []
    n_rows = x_test.shape[0]
    for target_idx in range(mean_prediction.size):
        # Broadcast the per-target mean over every validation row.
        column = np.full((n_rows, 1), mean_prediction[target_idx])
        mse_value = mean_squared_error(y_test[:, target_idx], column)
        mae_value = mean_absolute_error(y_test[:, target_idx], column)
        metrics.append([mse_value, mae_value])
    return metrics
示例#6
0
def test_y_mean_attribute_regressor():
    # With strategy='mean' the fitted attribute y_mean_ stores np.mean(y).
    features = [[0]] * 5
    targets = [1, 2, 4, 6, 8]
    regressor = DummyRegressor(strategy='mean')
    regressor.fit(features, targets)
    assert_equal(regressor.y_mean_, np.mean(targets))
示例#7
0
def test_regressor():
    """The default strategy predicts the mean of y regardless of X."""
    y = [1, 2, 1, 1]
    X = [[0]] * 4  # ignored
    dummy = DummyRegressor()
    dummy.fit(X, y)
    mean_of_y = 5. / 4
    assert_array_equal(dummy.predict(X), [mean_of_y] * len(X))
示例#8
0
def test_weights_regressor():
    """Check weighted average regression prediction on boston dataset."""
    reg1 = DummyRegressor(strategy='mean')
    reg2 = DummyRegressor(strategy='median')
    reg3 = DummyRegressor(strategy='quantile', quantile=.2)
    weights = [1, 2, 10]
    named_regs = [('mean', reg1), ('median', reg2), ('quantile', reg3)]
    ereg = VotingRegressor(named_regs, weights=weights)

    X_r_train, X_r_test, y_r_train, y_r_test = \
        train_test_split(X_r, y_r, test_size=.25)

    # Predictions from each base regressor, fitted in order.
    base_preds = [r.fit(X_r_train, y_r_train).predict(X_r_test)
                  for r in (reg1, reg2, reg3)]
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    # The ensemble must equal the weighted average of its members.
    avg = np.average(np.asarray(base_preds), axis=0, weights=weights)
    assert_almost_equal(ereg_pred, avg, decimal=2)

    # weights=None must behave exactly like explicit uniform weights.
    ereg_weights_none = VotingRegressor(named_regs, weights=None)
    ereg_weights_equal = VotingRegressor(named_regs, weights=[1, 1, 1])
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    assert_almost_equal(ereg_weights_none.predict(X_r_test),
                        ereg_weights_equal.predict(X_r_test),
                        decimal=2)
def train_classifier():
    """Vectorize the caption sets and fit a median-predicting baseline."""
    train_matrix = tfv.transform(video_captions_train)
    test_matrix = tfv.transform(video_captions_test)

    median_baseline = DummyRegressor(strategy="median")
    median_baseline.fit(train_matrix, Y_train)
    Y_pred_med = median_baseline.predict(test_matrix)
示例#10
0
def test_constants_not_specified_regressor():
    # strategy='constant' without a constant value must raise TypeError.
    features = [[0]] * 5
    targets = [1, 2, 4, 6, 8]
    regressor = DummyRegressor(strategy='constant')
    with pytest.raises(TypeError):
        regressor.fit(features, targets)
示例#11
0
def test_quantile_strategy_multioutput_regressor():
    """Multioutput targets: quantile strategy matches np.percentile per column."""
    rng = np.random.RandomState(seed=1)

    X_fit, y_fit = rng.randn(10, 10), rng.randn(10, 5)

    expected_median = np.median(y_fit, axis=0).reshape((1, -1))
    expected_q80 = np.percentile(y_fit, axis=0, q=80).reshape((1, -1))

    X_eval, y_eval = rng.randn(20, 10), rng.randn(20, 5)

    # quantile=0.5 is the column-wise median.
    model = DummyRegressor(strategy="quantile", quantile=0.5)
    model.fit(X_fit, y_fit)
    _check_equality_regressor(expected_median, y_fit, model.predict(X_fit),
                              y_eval, model.predict(X_eval))
    _check_behavior_2d(model)

    # quantile=0.8 is the column-wise 80th percentile.
    model = DummyRegressor(strategy="quantile", quantile=0.8)
    model.fit(X_fit, y_fit)
    _check_equality_regressor(expected_q80, y_fit, model.predict(X_fit),
                              y_eval, model.predict(X_eval))
    _check_behavior_2d(model)
示例#12
0
def train_intelligence(dataframe, text_column, classification_column):
    """Train a TF-IDF + LogisticRegression text model and persist it.

    Vectorizes `dataframe[text_column]`, fits a logistic regression on a
    75/25 split, prints the ten strongest positive/negative feature weights,
    pickles the model and vectorizer, and prints baseline vs. model scores.

    Args:
        dataframe: source DataFrame containing text and labels.
        text_column: name of the column with raw text.
        classification_column: name of the label column.
    """
    vectorizer = TfidfVectorizer(lowercase=False)
    bag_of_words = vectorizer.fit_transform(dataframe[text_column])

    # Fixed seed keeps the split reproducible across runs.
    train, test, class_train, class_test = train_test_split(
        bag_of_words,
        dataframe[classification_column],
        random_state=42,
        test_size=0.25)

    logistic_regression = LogisticRegression()
    logistic_regression.fit(train, class_train)

    # Inspect the most positive / most negative feature weights.
    pesos = pd.DataFrame(logistic_regression.coef_[0].T,
                         index=vectorizer.get_feature_names())
    print(pesos.nlargest(10, 0))
    print(pesos.nsmallest(10, 0))

    filename = 'anton_brain.sav'
    pickle.dump(logistic_regression, open(filename, 'wb'))

    filename = 'anton_vectorizer.sav'
    pickle.dump(vectorizer, open(filename, 'wb'))

    ###### baseline ######
    baseline = DummyRegressor(strategy="mean")
    baseline.fit(train, class_train)
    print('Baseline Accuracy: ')
    # NOTE(review): DummyRegressor ignores its input and always predicts the
    # mean of class_train, so this prints the training-label mean * 100, not
    # an accuracy. Also class_test (labels) is passed where features are
    # expected — confirm whether a DummyClassifier.score on `test` was meant.
    print(round(baseline.predict(class_test)[0] * 100, 2))
    ###### baseline ######

    print('Algorithm Accuracy:')
    print(round(logistic_regression.score(test, class_test) * 100, 2))
    return
示例#13
0
def test_weights_regressor():
    """Check weighted average regression prediction on diabetes dataset."""
    mean_reg = DummyRegressor(strategy="mean")
    median_reg = DummyRegressor(strategy="median")
    quantile_reg = DummyRegressor(strategy="quantile", quantile=0.2)
    named = [("mean", mean_reg), ("median", median_reg),
             ("quantile", quantile_reg)]
    voter = VotingRegressor(named, weights=[1, 2, 10])

    X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
        X_r, y_r, test_size=0.25
    )

    # Fit each base regressor in order and collect its test predictions.
    base_predictions = []
    for base in (mean_reg, median_reg, quantile_reg):
        base_predictions.append(base.fit(X_r_train, y_r_train).predict(X_r_test))
    voter_pred = voter.fit(X_r_train, y_r_train).predict(X_r_test)

    # The ensemble output is the weighted average of its members.
    expected = np.average(
        np.asarray(base_predictions), axis=0, weights=[1, 2, 10]
    )
    assert_almost_equal(voter_pred, expected, decimal=2)

    # Default (None) weights should be identical to explicit uniform weights.
    voter_none = VotingRegressor(named, weights=None)
    voter_uniform = VotingRegressor(named, weights=[1, 1, 1])
    voter_none.fit(X_r_train, y_r_train)
    voter_uniform.fit(X_r_train, y_r_train)
    assert_almost_equal(voter_none.predict(X_r_test),
                        voter_uniform.predict(X_r_test), decimal=2)
示例#14
0
class MeanRegressor(BaseEstimator):
    """Baseline estimator that always predicts the mean of the training target."""

    def __init__(self):
        """
        Model predicting the mean value of the target label
        """
        self.reg = DummyRegressor(strategy='mean')

    def fit(self, X, y):
        """Fit the underlying mean regressor.

        :param X: input dataframe; only the 'user_id' column is used
        :param y: labels dataframe
        :return:
        """
        self.reg.fit(X['user_id'], y)

    def predict(self, X):
        """Predict the training mean for every row of X.

        :param X: input dataframe; only the 'user_id' column is used
        :return: list of predicted values
        """
        return self.reg.predict(X['user_id'])
示例#15
0
def test_y_mean_attribute_regressor():
    """After fitting with strategy='mean', y_mean_ equals the target mean."""
    y = [1, 2, 4, 6, 8]
    X = [[0]] * len(y)
    model = DummyRegressor(strategy='mean')
    model.fit(X, y)
    assert_equal(model.y_mean_, np.mean(y))
示例#16
0
def test_unknown_strategey_regressor():
    # An unrecognized strategy name must raise ValueError at fit time.
    y = [1, 2, 4, 6, 8]
    X = [[0]] * 5
    bad = DummyRegressor(strategy='gona')
    with pytest.raises(ValueError):
        bad.fit(X, y)
示例#17
0
class GradientBoosting:
    """Gradient boosting regressor built on CART base learners.

    Fits `n_estimators` trees sequentially, each on the residual
    (y - current prediction), and accumulates their outputs scaled by
    `learning_rate`.
    """

    def __init__(self,
                 n_estimators=100,
                 learning_rate=0.1,
                 subsamples=1.0,
                 max_features=None,
                 max_depth=3,
                 min_leaf_size=1,
                 init=None):
        # init=None selects a mean-predicting DummyRegressor as the
        # initial model. NOTE(review): fit() also compares init_model to
        # the string 'zeros', implying init='zeros' is supported there,
        # but predict() would then call 'zeros'.predict and fail — confirm.
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsamples = subsamples
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_leaf_size = min_leaf_size
        if init is None:
            self.init_model = DummyRegressor()
        else:
            self.init_model = init
        self.estimators = []  # fitted CART trees, one per boosting round
        self.weights = []  # per-tree step multipliers (currently always 1)
        self.loss_by_iter = []  # MSE after the init model and each round

    def fit(self, X, y):
        """Fit the init model, then boost residuals for n_estimators rounds."""
        if self.init_model == 'zeros':
            # Start from an all-zero prediction instead of a fitted model.
            pred = np.zeros(y.shape)
        else:
            self.init_model.fit(X, y)
            pred = self.init_model.predict(X)
        res = pred.copy()
        self.loss_by_iter.append(mse(res, y))

        for i in tqdm(range(self.n_estimators)):
            # For squared error the negative gradient is just the residual.
            grad = (y - res)

            cur_estimator = CART(max_depth=self.max_depth,
                                 min_leaf_size=self.min_leaf_size,
                                 max_features=self.max_features)
            if self.subsamples < 1.0:
                # Stochastic boosting: fit this tree on a random row subset.
                sample_ids = np.arange(y.shape[0])
                np.random.shuffle(sample_ids)
                sample_ids = sample_ids[:int(y.shape[0] * self.subsamples)]
                cur_estimator.fit(X[sample_ids], grad[sample_ids])
            else:
                cur_estimator.fit(X, grad)
            self.estimators.append(cur_estimator)
            pred = cur_estimator.predict(X)
            # b = golden_section(res, pred, y)
            b = 1  # fixed step size; the line search above is disabled
            self.weights.append(b)
            res += self.learning_rate * b * pred
            self.loss_by_iter.append(mse(res, y))

    def predict(self, X):
        """Return init-model output plus all weighted tree contributions."""
        pred = self.init_model.predict(X)
        for i in range(self.n_estimators):
            pred += self.learning_rate * self.weights[i] * self.estimators[
                i].predict(X)
        return pred
示例#18
0
def test_dummy_regressor_on_nan_value():
    # NaN feature values are irrelevant: the regressor only looks at y.
    features = [[np.NaN]]
    targets = [1]
    model = DummyRegressor()
    model.fit(features, targets)
    predictions = model.predict(features)
    assert_array_equal(predictions, [1])
def dummy_regressor(X, y, args=None):
    """Fit a DummyRegressor baseline for comparison against other models.

    Args:
        X: training features (ignored by the dummy strategies themselves).
        y: training targets.
        args: optional dict of keyword arguments forwarded to DummyRegressor.

    Returns:
        The fitted DummyRegressor instance.
    """
    from sklearn.dummy import DummyRegressor
    # Use None instead of a mutable `{}` default argument (shared across
    # calls); an empty dict here preserves the original behavior.
    clf = DummyRegressor(**(args or {}))
    clf.fit(X, y)
    return clf
示例#20
0
def test_constants_not_specified_regressor():
    """Omitting `constant` with strategy='constant' raises a TypeError."""
    targets = [1, 2, 4, 6, 8]
    features = [[0]] * 5
    model = DummyRegressor(strategy="constant")
    expected_message = "Constant target value has to be specified"
    with pytest.raises(TypeError, match=expected_message):
        model.fit(features, targets)
示例#21
0
def dummy_regressor(X_train, X_test, y_train, y_test):
    """Fit a mean-predicting baseline and return its test RMSE, 3 decimals."""
    baseline = DummyRegressor()
    baseline.fit(X_train, y_train)
    predictions = baseline.predict(X_test)

    # Root mean squared error on the held-out split.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return round(rmse, 3)
def test_dummy_regressor_on_nan_value():
    """A NaN in X must not affect fitting or prediction."""
    y = [1]
    X = [[np.NaN]]
    y_expected = [1]
    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), y_expected)
示例#23
0
def test_y_mean_attribute_regressor():
    """strategy='mean' stores the target mean in the constant_ attribute."""
    y = [1, 2, 4, 6, 8]
    X = [[0]] * 5
    model = DummyRegressor(strategy="mean")
    model.fit(X, y)
    assert model.constant_ == np.mean(y)
示例#24
0
def test_dummy_regressor_on_3D_array():
    # A 3d object array is accepted because X is never inspected.
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])
    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), np.array([2, 2, 2]))
示例#25
0
def test_constant_size_multioutput_regressor():
    # A constant vector with the wrong number of outputs raises ValueError.
    rng = np.random.RandomState(seed=1)
    X = rng.randn(10, 10)
    y = rng.randn(10, 5)  # 5 outputs, but only 4 constants below
    model = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    with pytest.raises(ValueError):
        model.fit(X, y)
示例#26
0
def test_dummy_regressor_on_3D_array():
    """Features of any dimensionality are tolerated; only y matters."""
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])
    expected = np.array([2, 2, 2])
    model = DummyRegressor()
    model.fit(X, y)
    predicted = model.predict(X)
    assert_array_equal(predicted, expected)
示例#27
0
class Regressor(BaseEstimator):
    """Thin wrapper delegating fit/predict to a DummyRegressor baseline."""

    def __init__(self):
        # The wrapped estimator lives on self.clf for compatibility.
        self.clf = DummyRegressor()

    def fit(self, X, y):
        """Fit the wrapped dummy regressor."""
        self.clf.fit(X, y)

    def predict(self, X):
        """Return predictions from the wrapped dummy regressor."""
        return self.clf.predict(X)
示例#28
0
    def execute(self):
        """Fit a mean baseline on the train split and predict the test split.

        Returns a (predictions, ground truth) pair for the test partition,
        also caching the predictions on self.y_pred.
        """
        from sklearn.dummy import DummyRegressor

        baseline = DummyRegressor(strategy="mean")
        baseline.fit(self.partitions.x_train, self.partitions.y_train)
        self.y_pred = baseline.predict(self.partitions.x_test)

        return self.y_pred, self.partitions.y_test
def main():
    """Train a dummy baseline on the CSV training data and pickle it to disk."""
    X_train = pd.read_csv("data/X_train.csv")
    y_train = pd.read_csv("data/y_train.csv")

    dummy_reg = DummyRegressor()
    dummy_reg.fit(X_train, y_train)

    # Use a context manager so the file handle is closed even on error
    # (the original `pickle.dump(obj, open(...))` leaked the handle).
    with open("models/dummy_reg.pkl", 'wb') as fh:
        pickle.dump(dummy_reg, fh)
示例#30
0
def dummy_regressor_accuracy(x,
                             y,
                             evaluator: Callable,
                             strategy: str = 'mean'):
    """Fit a DummyRegressor, print its evaluator score, and return it.

    Args:
        x: training features (ignored by the dummy strategies).
        y: training targets.
        evaluator: callable(y_pred, y_true) producing a scalar score.
        strategy: DummyRegressor strategy name ('mean', 'median', ...).

    Returns:
        The fitted DummyRegressor.
    """
    # Pass strategy by keyword: scikit-learn estimator constructor
    # parameters are keyword-only (positional use removed in sklearn 1.0),
    # so DummyRegressor(strategy) raises on current versions.
    dummy = DummyRegressor(strategy=strategy)
    dummy.fit(x, y)
    y_hat = dummy.predict(x)
    print('DummyRegressor accuracy:', evaluator(y_hat, y))
    return dummy
示例#31
0
def test_median_strategy_regressor():
    """strategy='median' predicts np.median(y) for every sample."""
    rng = np.random.RandomState(seed=1)

    y = rng.randn(5)
    X = [[0]] * 5  # ignored

    model = DummyRegressor(strategy="median")
    model.fit(X, y)
    assert_array_equal(model.predict(X), [np.median(y)] * len(X))
示例#32
0
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             return_indicator=True,
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    # Zero-weight the first ten test samples; a scorer honouring
    # sample_weight must then match scoring with those rows dropped.
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    # Map every scorer name to an estimator it can legitimately score.
    estimator = dict([(name, sensible_regr) for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf) for name in CLF_SCORERS] +
                     [(name, sensible_ml_clf)
                      for name in MULTILABEL_ONLY_SCORERS])

    for name, scorer in SCORERS.items():
        # Multilabel scorers need the multilabel target matrix.
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(estimator[name],
                              X_test,
                              target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            # Weighting must actually change the score...
            assert_not_equal(weighted,
                             unweighted,
                             msg="scorer {0} behaves identically when "
                             "called with sample weights: {1} vs "
                             "{2}".format(name, weighted, unweighted))
            # ...and zero weights must equal dropping those samples.
            assert_almost_equal(weighted,
                                ignored,
                                err_msg="scorer {0} behaves differently when "
                                "ignoring samples and setting sample_weight to"
                                " 0: {1} vs {2}".format(
                                    name, weighted, ignored))

        except TypeError as e:
            # Scorers without sample_weight support must say so clearly.
            assert_true(
                "sample_weight" in str(e),
                "scorer {0} raises unhelpful exception when called "
                "with sample weights: {1}".format(name, str(e)))
def train_dummy_regressors(features, target):
    """Fit mean and median dummy baselines and print each score."""
    for strat in ('mean', 'median'):
        baseline = DummyRegressor(strategy=strat)
        baseline.fit(features, y=target.flatten())

        score_pct = 100 * baseline.score(features, target)
        print('{:.1f} % score for a dummy regressor using the {} stragety'.format(
            score_pct,
            baseline.get_params()['strategy']))
示例#34
0
def test_constant_size_multioutput_regressor():
    """Mismatched constant length raises ValueError with a shape message."""
    rng = np.random.RandomState(seed=1)
    X = rng.randn(10, 10)
    y = rng.randn(10, 5)

    model = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    expected_message = r"Constant target value should have shape \(5, 1\)."
    with pytest.raises(ValueError, match=expected_message):
        model.fit(X, y)
示例#35
0
def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    # Zero-weight the first ten test samples; a scorer honouring
    # sample_weight must then match scoring with those rows dropped.
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    # Map every scorer name to an estimator it can legitimately score.
    estimator = dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS]
        + [(name, sensible_clf) for name in CLF_SCORERS]
        + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )

    for name, scorer in SCORERS.items():
        # Multilabel scorers need the multilabel target matrix.
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(estimator[name], X_test, target, sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            # Weighting must actually change the score...
            assert_not_equal(
                weighted,
                unweighted,
                msg="scorer {0} behaves identically when "
                "called with sample weights: {1} vs "
                "{2}".format(name, weighted, unweighted),
            )
            # ...and zero weights must equal dropping those samples.
            assert_almost_equal(
                weighted,
                ignored,
                err_msg="scorer {0} behaves differently when "
                "ignoring samples and setting sample_weight to"
                " 0: {1} vs {2}".format(name, weighted, ignored),
            )

        except TypeError as e:
            # Scorers without sample_weight support must say so clearly.
            assert_true(
                "sample_weight" in str(e),
                "scorer {0} raises unhelpful exception when called " "with sample weights: {1}".format(name, str(e)),
            )
示例#36
0
def test_median_strategy_regressor():
    # Every prediction equals the median of the training targets.
    seeded = np.random.RandomState(seed=1)
    targets = seeded.randn(5)
    features = [[0]] * 5  # ignored

    reg = DummyRegressor(strategy="median")
    reg.fit(features, targets)
    expected = [np.median(targets)] * len(features)
    assert_array_equal(reg.predict(features), expected)
示例#37
0
class DummyEstimator(BaseTesterEstimator):
    """Tester estimator backed by sklearn's DummyRegressor."""

    def __init__(self):
        # self.regressor holds the delegate model.
        self.regressor = DummyRegressor()

    def fit(self, x, y):
        """Delegate fitting to the wrapped regressor."""
        self.regressor.fit(x, y)

    def predict(self, x):
        """Delegate prediction to the wrapped regressor."""
        return self.regressor.predict(x)
示例#38
0
def test_dummy_regressor_return_std():
    """predict(return_std=True) yields (mean, std); constant y gives std 0."""
    y = np.array([2, 2, 2])
    X = [[0]] * 3  # ignored
    model = DummyRegressor()
    model.fit(X, y)
    result = model.predict(X, return_std=True)
    # Two elements come back: the predictions and their standard deviations.
    assert len(result) == 2
    # All targets are identical, so the predictive std is zero everywhere.
    assert_array_equal(result[1], np.array([0, 0, 0]))
示例#39
0
def test_quantile_invalid():
    """strategy='quantile' without a quantile value raises ValueError."""
    X = [[0]] * 5  # ignored
    y = [0] * 5  # ignored

    model = DummyRegressor(strategy="quantile", quantile=None)
    expected_message = (
        "When using `strategy='quantile', you have to specify the desired quantile"
    )
    with pytest.raises(ValueError, match=expected_message):
        model.fit(X, y)
示例#40
0
def test_dummy_regressor_return_std():
    # return_std=True returns a pair: predictions plus per-sample std.
    targets = np.array([2, 2, 2])
    features = [[0]] * 3  # ignored
    reg = DummyRegressor()
    reg.fit(features, targets)
    pair = reg.predict(features, return_std=True)
    assert_equal(len(pair), 2)
    # Identical targets give an all-zero std estimate.
    assert_array_equal(pair[1], np.array([0, 0, 0]))
示例#41
0
def train_average_predictor(journeys: DataFrame, last: str,
                            target: str) -> Optional[Any]:
    """Fit a median travel-time baseline, or return None when no data remains."""
    # Keep only rows where both the `last` and `target` columns are present.
    valid = pd.notnull(journeys[target]) & pd.notnull(journeys[last])
    journeys = journeys[valid]
    # Convert nanosecond durations to seconds.
    y = travel_times(journeys, [], last,
                     target).astype("int64") / 1_000_000_000
    if len(y) == 0:
        return None
    predictor = DummyRegressor(strategy="median")
    predictor.fit(journeys, y)
    return predictor
示例#42
0
def baseline():
    """Fit a mean-predicting baseline on the global scaled splits and score it."""
    from sklearn.dummy import DummyRegressor
    model = DummyRegressor(strategy='mean')
    model.fit(X_train_scaled, y_train)
    train_predictions = model.predict(X_train_scaled)
    test_predictions = model.predict(X_test_scaled)
    print(r2_score(y_test, test_predictions))
    return scores_results(y_train, y_test, train_predictions, test_predictions)
示例#43
0
def test_regressor_prediction_independent_of_X(strategy):
    """Two regressors fit on the same y but different X must agree."""
    y = [0, 2, 1, 1]

    X_first = [[0]] * 4
    model_a = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    model_a.fit(X_first, y)
    preds_a = model_a.predict(X_first)

    X_second = [[1]] * 4
    model_b = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    model_b.fit(X_second, y)
    preds_b = model_b.predict(X_second)

    # Predictions depend only on y, never on the feature values.
    assert_array_equal(preds_a, preds_b)
示例#44
0
def _make_estimators(X_train, y_train, y_ml_train):
    """Build a scorer-name -> fitted-estimator mapping for scoring tests."""
    # Median dummy regressor covers all regression scorers.
    regr = DummyRegressor(strategy='median')
    regr.fit(X_train, y_train)
    # Deterministic trees cover classification and multilabel scorers.
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    ml_clf = DecisionTreeClassifier(random_state=0)
    ml_clf.fit(X_train, y_ml_train)
    mapping = {name: regr for name in REGRESSION_SCORERS}
    mapping.update({name: clf for name in CLF_SCORERS})
    mapping.update({name: ml_clf for name in MULTILABEL_ONLY_SCORERS})
    return mapping
示例#45
0
def test_constant_strategy_regressor():
    """Both list and scalar `constant` values yield constant predictions."""
    rng = np.random.RandomState(seed=1)

    y = rng.randn(5)
    X = [[0]] * 5  # ignored

    # The list form [43] is checked first, then the scalar 43.
    for constant in ([43], 43):
        model = DummyRegressor(strategy="constant", constant=constant)
        model.fit(X, y)
        assert_array_equal(model.predict(X), [43] * len(X))
示例#46
0
def test_multioutput_regressor():
    """Default strategy tiles the per-column training mean over all rows."""
    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)

    column_means = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)

    assert_array_equal(np.tile(column_means, (y_learn.shape[0], 1)),
                       est.predict(X_learn))
    assert_array_equal(np.tile(column_means, (y_test.shape[0], 1)),
                       est.predict(X_test))
    _check_behavior_2d(est)
示例#47
0
def test_mean_strategy_multioutput_regressor():
    """The mean strategy matches per-column means for 2d targets."""
    rng = np.random.RandomState(seed=1)

    X_fit = rng.randn(10, 10)
    y_fit = rng.randn(10, 5)
    expected_mean = np.mean(y_fit, axis=0).reshape((1, -1))

    X_eval = rng.randn(20, 10)
    y_eval = rng.randn(20, 5)

    # Correctness oracle
    model = DummyRegressor()
    model.fit(X_fit, y_fit)
    _check_equality_regressor(expected_mean, y_fit, model.predict(X_fit),
                              y_eval, model.predict(X_eval))
    _check_behavior_2d(model)
示例#48
0
def _minimize_simbo_general(fun,
                            x0,  # only used to get number of features
                            args=(),
                            callback=None,
                            batch_size=100,
                            population_size=10000,
                            maxiter=10000,
                            scorer=None, # if no scorer given, scores are constant
                            selector=None, # only relevant is sampler is given
                            sampler=None):
    """Surrogate-assisted batch optimization of `fun`.

    Each iteration samples a candidate population, ranks it with `scorer`,
    evaluates the `batch_size` lowest-scored candidates on the true
    objective, and refits `scorer` (on the batch) and `sampler` (on the
    selected subset). Returns an optimize result built from the best point
    of the final batch.
    """
    # Total true-objective evaluations are capped at maxiter.
    n_iter = int(maxiter / batch_size)
    assert n_iter > 0

    dummy_generator = generative_models.DummyGenerator(len(x0))

    if scorer is None:
        # DummyRegressor yields constant scores, making selection random.
        scorer = DummyRegressor()
    if sampler is None:
        sampler = dummy_generator

    # A float in (0, 1) is shorthand for a percentile-based selector.
    if isinstance(selector, float) and 0 < selector < 1:
        selector = percentile_selector(selector)

    for i in range(n_iter):
        if i == 0:
            # First batch is drawn blindly; the scorer is not fitted yet.
            batch = dummy_generator.sample(batch_size)
        else:
            # Rank a large population with the surrogate and keep the
            # batch_size candidates with the smallest predicted score.
            population = sampler.sample(population_size)
            scores = scorer.predict(population)
            batch_w_score = heapq.nsmallest(batch_size, zip(scores, population),
                                            key=lambda x: x[0])
            batch = [v for score, v in batch_w_score]
        results = optimize_utils.score_multi(fun, batch, args, callback)
        selected = selector(results, batch) if selector is not None else batch
        scorer.fit(batch, results)
        sampler.fit(selected)

    # NOTE(review): only the final batch is searched for the best point;
    # earlier batches are not tracked — confirm this is intended.
    best_fval, best_x = max(zip(results, batch), key=lambda x: x[0])
    nfev = batch_size * n_iter
    return optimize_utils.to_result(x=best_x, fun=best_fval,
                                    niter=n_iter, nfev=nfev)
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # Zero-weight the first ten test samples; a scorer honouring
    # sample_weight must then match scoring with those rows dropped.
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier()
    sensible_clf.fit(X_train, y_train)
    # Map every scorer name to an estimator it can legitimately score.
    estimator = dict([(name, sensible_regr)
                      for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf)
                      for name in CLF_SCORERS])

    for name, scorer in SCORERS.items():
        try:
            weighted = scorer(estimator[name], X_test, y_test,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], y_test[10:])
            unweighted = scorer(estimator[name], X_test, y_test)
            # Weighting must actually change the score...
            assert_not_equal(weighted, unweighted,
                             "scorer {0} behaves identically when called with "
                             "sample weights: {1} vs {2}".format(name,
                                                                 weighted,
                                                                 unweighted))
            # ...and zero weights must equal dropping those samples.
            assert_equal(weighted, ignored,
                         "scorer {0} behaves differently when ignoring "
                         "samples and setting sample_weight to 0: "
                         "{1} vs {2}".format(name, weighted, ignored))

        except TypeError as e:
            # Scorers without sample_weight support must say so clearly.
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
示例#50
0
def test_constant_strategy_multioutput_regressor():
    """A per-output constant vector is echoed back for every sample."""
    rng = np.random.RandomState(seed=1)

    X_fit = rng.randn(10, 10)
    y_fit = rng.randn(10, 5)

    # One constant per output column (test with 2d array).
    constants = rng.randn(5)

    X_eval = rng.randn(20, 10)
    y_eval = rng.randn(20, 5)

    # Correctness oracle
    model = DummyRegressor(strategy="constant", constant=constants)
    model.fit(X_fit, y_fit)
    _check_equality_regressor(constants, y_fit, model.predict(X_fit),
                              y_eval, model.predict(X_eval))
    _check_behavior_2d_for_constant(model)
示例#51
0
def simplest(cube, y, cv):
    """ just use the mean to impute the missing values
    """
    from sklearn.dummy import DummyRegressor
    model = DummyRegressor()
    # Flatten each sample's 2d slice into one feature row.
    X = cube.reshape(cube.shape[0], cube.shape[1] * cube.shape[2])
    sse = np.zeros(y.shape[1])
    # Accumulate per-target squared error across the CV folds.
    for train_idx, test_idx in cv:
        model.fit(X[train_idx], y[train_idx])
        fold_pred = model.predict(X[test_idx])
        sse += np.mean((fold_pred - y[test_idx]) ** 2, 0)
    return sse
示例#52
0
def test_quantile_strategy_regressor():
    """The quantile strategy must predict the requested quantile of y for
    every sample: median at 0.5, min at 0, max at 1, percentile otherwise."""
    rng = np.random.RandomState(seed=1)

    X = [[0]] * 5  # features are ignored by DummyRegressor
    y = rng.randn(5)

    expectations = [
        (0.5, np.median(y)),
        (0, np.min(y)),
        (1, np.max(y)),
        (0.3, np.percentile(y, q=30)),
    ]
    for quantile, expected in expectations:
        model = DummyRegressor(strategy="quantile", quantile=quantile)
        model.fit(X, y)
        assert_array_equal(model.predict(X), [expected] * len(X))
示例#53
0
    def test_stacked_featurizer(self):
        """Exercise StackedFeaturizer with a regressor and a classifier.

        Checks the predicted feature values, the default and renamed feature
        labels, and the multi-class probability output (which has
        n_classes - 1 entries, as shown by the 2-class and 3-class cases).
        """
        data = self.make_test_data()
        data['y'] = [1, 2, 3]

        # Test for a regressor
        model = DummyRegressor()
        model.fit(self.multi.featurize_many(data['x']), data['y'])

        #  Test the predictions
        f = StackedFeaturizer(self.single, model)
        # DummyRegressor predicts the mean of y: (1 + 2 + 3) / 3 == 2
        self.assertEqual([2], f.featurize(data['x'][0]))

        #  Test the feature names
        self.assertEqual(['prediction'], f.feature_labels())
        f.name = 'ML'
        self.assertEqual(['ML prediction'], f.feature_labels())

        # Test classifier
        model = DummyClassifier("prior")
        data['y'] = [0, 0, 1]
        model.fit(self.multi.featurize_many(data['x']), data['y'])

        #  Test the prediction
        f.model = model
        # "prior" strategy reports class frequencies: P(class 0) == 2/3.
        # Only one probability is emitted for two classes (the last class
        # probability is redundant).
        self.assertEqual([2. / 3], f.featurize(data['x'][0]))

        #  Test the feature labels
        # Classifier labels require class_names to be set first.
        self.assertRaises(ValueError, f.feature_labels)
        f.class_names = ['A', 'B']
        self.assertEqual(['ML P(A)'], f.feature_labels())

        # Test with three classes
        data['y'] = [0, 2, 1]
        model.fit(self.multi.featurize_many(data['x']), data['y'])

        # Three classes -> two probability features, each 1/3 under "prior".
        self.assertArrayAlmostEqual([1. / 3] * 2, f.featurize(data['x'][0]))
        f.class_names = ['A', 'B', 'C']
        self.assertEqual(['ML P(A)', 'ML P(B)'], f.feature_labels())
def _train_and_report(header, model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split, predict on the test split, and
    print *header* followed by mean absolute error and R^2."""
    print(header)
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))


def main():
    """Predict normalized Yelp review quality from tf-idf text features and
    report MAE / R^2 for several regressors against a mean baseline."""
    # read review data
    print('parsing review data...')
    reviews = parse_json('./yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

    # keep only reviews from 2008 onward (older vote counts are not comparable)
    reviews = [
        review for review in reviews
        if datetime.datetime.strptime(review['date'], '%Y-%m-%d').year >= 2008
    ]

    # tokenize text for all reviews
    print('tokenizing text for all reviews...')
    texts = [review['text'] for review in reviews]
    count_vect = CountVectorizer(max_features=100)
    X = count_vect.fit_transform(texts)

    # transform from occurrence counts to tf-idf frequencies
    print('converting occurrence to frequency...')
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X)

    # load the linear model that estimates expected votes per review year
    clf = joblib.load('./normalization/linear_model_for_normalization.pkl')

    # label = total votes / expected votes for the review's year
    print('calculating labels...')
    y = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        normalizor = clf.predict(np.array([[review_date.year]]))[0][0]
        y.append(sum(review['votes'].values()) / normalizor)

    # chronological 60/40 split into train and test sets
    print('splitting into train and test set...')
    train_len = int(X.shape[0] * 0.6)
    X_train, X_test = X[:train_len, :], X[train_len:, :]
    y_train, y_test = y[:train_len], y[train_len:]
    print('train size:', X_train.shape)
    print('test size:', X_test.shape)

    # scale the attributes to [0, 1]; the scaler is fit on the training
    # split only so no test statistics leak into training
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # train, predict and evaluate each candidate model with one shared helper
    print('training, predicting and evaluating...')
    candidates = [
        # DummyRegressor is the baseline: always predicts the training mean.
        ('\nDummy Regression:', DummyRegressor(strategy='mean')),
        ('\nLinear_regression: ', LinearRegression()),
        ('\nRidge: ', Ridge()),
        # NOTE(review): header string kept from the original, although the
        # model is PassiveAggressiveRegressor, not polynomial.
        ('\nPoly: ', PassiveAggressiveRegressor()),
        ('\nAdaBoost: ', AdaBoostRegressor()),
        ('\nRandom Forest:', RandomForestRegressor()),
    ]
    for header, model in candidates:
        _train_and_report(header, model, X_train, y_train, X_test, y_test)
def _fit_and_score(header, model, X_train, y_train, X_test, y_test):
    """Fit *model*, predict on the held-out split, and print *header*
    followed by mean absolute error and R^2."""
    print(header)
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))


def main():
    """Load pre-built train/test CSVs, min-max scale the features, and
    compare several regressors against a mean-predicting baseline."""
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # NOTE: polynomial feature expansion, KNN regression and BernoulliRBM
    # were tried previously and performed worse, so they are not evaluated.

    # scale the attributes to [0, 1]; fit on the training split only so no
    # test statistics leak into training
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # train, predict and evaluate each candidate model with one shared helper
    print('training, predicting and evaluating...')
    candidates = [
        # DummyRegressor is the baseline: always predicts the training mean.
        ('\nDummy Regression: (baseline)', DummyRegressor(strategy='mean')),
        ('\nLinear_regression: ', LinearRegression()),
        ('\nAdaBoost: ', AdaBoostRegressor()),
        ('\nRandom Forest:', RandomForestRegressor()),
    ]
    for header, model in candidates:
        _fit_and_score(header, model, X_train, y_train, X_test, y_test)
示例#56
0
def test_regressor_score_with_None(y, y_test):
    """Fitting and scoring must accept X=None, and the score must be 1.0
    for the parametrized (y, y_test) pairs this test receives."""
    model = DummyRegressor().fit(None, y)
    assert_equal(model.score(None, y_test), 1.0)
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = y[train]
        y_test = y[test]

        X_train = np.hstack([X_fmri_train,X_meg_train])
        X_test = np.hstack([X_fmri_test,X_meg_test])
        
        pls.fit(X_train, y_train)
        pred = pls.predict(X_test)

        mae += mean_absolute_error(y_test, pred)

        dumb.fit(X_train, y_train)
        dumb_pred = dumb.predict(X_test)
        dumb_mae += mean_absolute_error(y_test,dumb_pred)

        if within:
            pls.fit(X_fmri_train, y_train)
            pred = pls.predict(X_fmri_test)
            fmri_mae += mean_absolute_error(y_test, pred)

            pls.fit(X_meg_train, y_train)
            pred = pls.predict(X_meg_test)
            meg_mae += mean_absolute_error(y_test, pred)

    comp_scores.append(mae/nfolds)
    dumb_scores.append(dumb_mae/nfolds)
    fmri_scores.append(fmri_mae/nfolds)
示例#58
0
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')

    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        
        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)

        mae += mean_absolute_error(X_meg_test, pred)

        dumb.fit(X_fmri_train, X_meg_train)
        dumb_pred = dumb.predict(X_fmri_test)
        dumb_mae += mean_absolute_error(X_meg_test,dumb_pred)

    comp_scores.append(mae/nfolds)
    dumb_scores.append(dumb_mae/nfolds)

# Plotting: select the non-interactive Agg backend before importing pyplot so
# this also works on a headless machine.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Plot PLS scores (comp_scores) against the dummy baseline (dumb_scores)
# over the number-of-components sweep (max_comps).
# NOTE(review): `seed`, `band`, `home`, `max_comps`, `comp_scores` and
# `dumb_scores` are defined earlier in the script; `seed` is presumably a
# string (it is concatenated with str(band)) -- confirm upstream.
plt.plot(max_comps,comp_scores,max_comps,dumb_scores)
t_str = seed + str(band)
plt.title(t_str)
plt.savefig(home+'/tmp/meg_fmri_%s_%s.png'%(seed,band[0]))