def Random_forest(features, target, test_size_percent=0.2, cv_split=3):
    '''Features -> Pandas DataFrame with attributes as columns
       target -> Pandas DataFrame with target column for prediction
       test_size_percent -> Percentage of data points to be used for testing'''
    X_array = features.to_numpy()
    y_array = target.to_numpy()
    model_rdf = RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    model_rdf.fit(X_train, y_train)
    test_prediction = model_rdf.predict(X_test)
    tscv = TimeSeriesSplit(n_splits=cv_split)

    # Pass the TimeSeriesSplit object itself, not its n_splits attribute,
    # so the chronological splits are actually used.
    training_score = cross_val_score(model_rdf, X_train, y_train, cv=tscv)
    testing_score = cross_val_score(model_rdf, X_test, y_test, cv=tscv)
    print("Cross-val Training score:", training_score.mean())
#    print("Cross-val Testing score:", testing_score.mean())
    training_predictions = cross_val_predict(model_rdf, X_train, y_train, cv=tscv)
    testing_predictions = cross_val_predict(model_rdf, X_test, y_test, cv=tscv)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
#    test_accuracy_model = metrics.r2_score(y_test, test_prediction_model)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

#    print("Cross-val predicted accuracy:", training_accuracy)
    print("Test-predictions accuracy:", test_accuracy)

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return model_rdf
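# --- Usage sketch (added; not from the original source): calling Random_forest
# on a synthetic time-indexed DataFrame. Assumes pandas/numpy are imported as
# pd/np and that plot_model is defined elsewhere in this module.
demo_index = pd.date_range('2020-01-01', periods=200, freq='H')
demo_features = pd.DataFrame({'temp': np.random.rand(200),
                              'humidity': np.random.rand(200)}, index=demo_index)
demo_target = pd.DataFrame({'power': np.random.rand(200)}, index=demo_index)
demo_model = Random_forest(demo_features, demo_target, test_size_percent=0.2, cv_split=3)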
def svm_regressor(features, target, test_size_percent=0.2, cv_split=5):
    '''Features -> Pandas DataFrame with attributes as columns
       target -> Pandas DataFrame with target column for prediction
       test_size_percent -> Percentage of data points to be used for testing'''
    # Use separate scalers so the feature scaler is not refitted on the target
    x_scale = preprocessing.MinMaxScaler()
    y_scale = preprocessing.MinMaxScaler()
    X_array = x_scale.fit_transform(features)
    y_array = y_scale.fit_transform(target)
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    svr = SVR(kernel='rbf', C=10, gamma=1)
    svr.fit(X_train, y_train.ravel())
    test_prediction = svr.predict(X_test)
    tscv = TimeSeriesSplit(n_splits=cv_split)

    training_score = cross_val_score(svr, X_train, y_train, cv=tscv)
    testing_score = cross_val_score(svr, X_test, y_test, cv=tscv)
    print("Cross-val Training score:", training_score.mean())
#    print("Cross-val Testing score:", testing_score.mean())
    training_predictions = cross_val_predict(svr, X_train, y_train, cv=tscv)
    testing_predictions = cross_val_predict(svr, X_test, y_test, cv=tscv)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
#    test_accuracy_model = metrics.r2_score(y_test, test_prediction_model)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

#    print("Cross-val predicted accuracy:", training_accuracy)
    print("Test-predictions accuracy:", test_accuracy)
    return svr
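# --- Sketch (added; an assumption, not the author's code): with a dedicated
# target scaler, predictions can be mapped back to original units via
# inverse_transform instead of the manual max/min arithmetic used further below.
demo_y_scaler = preprocessing.MinMaxScaler()
demo_y_scaled = demo_y_scaler.fit_transform(np.random.rand(50, 1))
demo_y_original = demo_y_scaler.inverse_transform(demo_y_scaled)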
Example #3
def test_cross_val_predict_with_method():
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))

    kfold = KFold(len(iris.target))

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
def linear_regression(features, target, test_size_percent=0.2, cv_split=5):
    '''Features -> Pandas DataFrame with attributes as columns
       target -> Pandas DataFrame with target column for prediction
       test_size_percent -> Percentage of data points to be used for testing'''
    X_array = features.to_numpy()
    y_array = target.to_numpy()
    ols = linear_model.LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
#    model = ols.fit(X_train, y_train)
    ols.fit(X_train, y_train)
#    test_prediction_model = ols.predict(X_test)
    tscv = TimeSeriesSplit(n_splits=cv_split)

    training_score = cross_val_score(ols, X_train, y_train, cv=tscv)
    testing_score = cross_val_score(ols, X_test, y_test, cv=tscv)
    print("Cross-val Training score:", training_score.mean())
#    print("Cross-val Testing score:", testing_score.mean())
    training_predictions = cross_val_predict(ols, X_train, y_train, cv=tscv)
    testing_predictions = cross_val_predict(ols, X_test, y_test, cv=tscv)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
#    test_accuracy_model = metrics.r2_score(y_test, test_prediction_model)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

#    print("Cross-val predicted accuracy:", training_accuracy)
    print("Test-predictions accuracy:", test_accuracy)

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return ols
Example #5
def scan2D(X, y, window=(10, 10), estimator_params=dict(n_jobs=-1), cv=3):
    "2D scanning"
    inputs, labels, instances = [], [], []
    instance_count = 0
    for sample, label in zip(X, y):
        for s1 in range(sample.shape[0] - window[0]):
            for s2 in range(sample.shape[1] - window[1]):
                part = sample[s1:s1 + window[0], s2:s2 + window[1]]
                inputs.append(part.flatten())
                labels.append(label)
                instances.append(instance_count)
        instance_count += 1
    rf = RandomForestClassifier(**estimator_params)
    # Copy before adding max_features=1 so the (mutable) default dict passed
    # as estimator_params is not modified across calls.
    cf_params = dict(estimator_params, max_features=1)
    cf = RandomForestClassifier(**cf_params)
    probas1 = cross_val_predict(rf, inputs, labels, cv=cv, method='predict_proba')
    probas2 = cross_val_predict(cf, inputs, labels, cv=cv, method='predict_proba')
    probas = []
    for instance in set(instances):
        mask = [i == instance for i in instances]
        p1 = probas1[mask]
        p2 = probas2[mask]
        p = np.concatenate([p1.flatten(), p2.flatten()], axis=0)
        probas.append(p)
    return probas
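# --- Usage sketch (added; not from the original source): running scan2D on
# tiny synthetic "images". With 16x16 inputs and a 10x10 window, each image
# yields 6*6 = 36 flattened patches before the cross-validated probabilities
# are pooled back per image.
demo_imgs = [np.random.rand(16, 16) for _ in range(20)]
demo_labels = [i % 2 for i in range(20)]
demo_probas = scan2D(demo_imgs, demo_labels, window=(10, 10), cv=3)
print(len(demo_probas), demo_probas[0].shape)   # 20 vectors, one per image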
def neural_net(features, target, test_size_percent=0.2, cv_split=3, n_iter=100, learning_rate=0.01):
    '''Features -> Pandas DataFrame with attributes as columns
       target -> Pandas DataFrame with target column for prediction
       test_size_percent -> Percentage of data points to be used for testing'''
    x_scale = preprocessing.MinMaxScaler()
    y_scale = preprocessing.MinMaxScaler()
    X_array = x_scale.fit_transform(features)
    y_array = y_scale.fit_transform(target)
    mlp = Regressor(layers=[Layer("Rectifier", units=5),   # Hidden layer 1
                            Layer("Rectifier", units=3),   # Hidden layer 2
                            Layer("Linear")],              # Output layer
                    n_iter=n_iter,
                    learning_rate=learning_rate)  # use the argument instead of a hard-coded 0.01
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    mlp.fit(X_train, y_train)
    test_prediction = mlp.predict(X_test)
    tscv = TimeSeriesSplit(n_splits=cv_split)

    training_score = cross_val_score(mlp, X_train, y_train, cv=tscv)
    testing_score = cross_val_score(mlp, X_test, y_test, cv=tscv)
    print("Cross-val Training score:", training_score.mean())
#    print("Cross-val Testing score:", testing_score.mean())
    training_predictions = cross_val_predict(mlp, X_train, y_train, cv=tscv)
    testing_predictions = cross_val_predict(mlp, X_test, y_test, cv=tscv)

    training_accuracy = metrics.r2_score(y_train, training_predictions)
#    test_accuracy_model = metrics.r2_score(y_test, test_prediction_model)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)

#    print("Cross-val predicted accuracy:", training_accuracy)
    print("Test-predictions accuracy:", test_accuracy)

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return mlp
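# --- Sketch (added; an assumption, not the author's code): sknn's Regressor is
# unmaintained, so a rough scikit-learn equivalent of the network above would
# be MLPRegressor with two ReLU hidden layers of 5 and 3 units.
from sklearn.neural_network import MLPRegressor
mlp_sklearn = MLPRegressor(hidden_layer_sizes=(5, 3), activation='relu',
                           max_iter=100, learning_rate_init=0.01)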
Example #7
    def fit(self, X, y):
        # Check data
        X, y = np.array(X), np.array(y)
        X, y = check_X_y(X, y)
        # Split to grow cascade and validate
        mask = np.random.random(y.shape[0]) < self.validation_fraction
        X_tr, X_vl = X[mask], X[~mask]
        y_tr, y_vl = y[mask], y[~mask]

        self.classes_ = unique_labels(y)
        self.layers_, inp_tr, inp_vl = [], X_tr, X_vl
        self.scores_ = []

        # First layer
        forests = [RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1),
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1)]
        _ = [f.fit(inp_tr, y_tr) for f in forests]
        p_vl = [f.predict_proba(inp_vl) for f in forests]
        labels = [self.classes_[i] for i in np.argmax(np.array(p_vl).mean(axis=0), axis=1)]
        score = self.scoring(y_vl, labels)
        self.layers_.append(forests)
        self.scores_.append(score)
        p_tr = [cross_val_predict(f, inp_tr, y_tr, cv=self.cv, method='predict_proba') for f in forests]

        # Fit other layers
        last_score = score
        inp_tr, inp_vl = np.concatenate([X_tr]+p_tr, axis=1), np.concatenate([X_vl]+p_vl, axis=1)
        while True:  # Grow cascade
            forests = [RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1),
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1)]
            _ = [forest.fit(inp_tr, y_tr) for forest in forests] # Fit the forest
            p_vl = [forest.predict_proba(inp_vl) for forest in forests]
            labels = [self.classes_[i] for i in np.argmax(np.array(p_vl).mean(axis=0), axis=1)]
            score = self.scoring(y_vl, labels)

            if score - last_score > self.tolerance:
                self.layers_.append(forests)
                p_tr = [cross_val_predict(f, inp_tr, y_tr, cv=self.cv, method='predict_proba') for f in forests]
                inp_tr, inp_vl = np.concatenate([X_tr]+p_tr, axis=1), np.concatenate([X_vl]+p_vl, axis=1)
                self.scores_.append(score)
                last_score = score
                print(self.scores_)
            else:
                break
        # Retrain on entire dataset
        inp_ = X
        for forests in self.layers_:
            _ = [f.fit(inp_, y) for f in forests]
            p = [cross_val_predict(f, inp_, y, cv=self.cv, method='predict_proba') for f in forests]
            inp_ = np.concatenate([X]+p, axis=1)
        return self
Example #8
def test_cross_val_predict_sparse_prediction():
    # check that cross_val_predict gives same result for sparse and dense input
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          return_indicator=True,
                                          random_state=1)
    X_sparse = csr_matrix(X)
    y_sparse = csr_matrix(y)
    classif = OneVsRestClassifier(SVC(kernel='linear'))
    preds = cross_val_predict(classif, X, y, cv=10)
    preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10)
    preds_sparse = preds_sparse.toarray()
    assert_array_almost_equal(preds_sparse, preds)
Example #9
def test_cross_val_predict_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y2)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cross_val_predict(clf, X_df, y_ser)
Example #10
 def fit(self, x, y, **params):
     """ fit training data """
     preds = []
     for i, clf in enumerate(self.clfs):
         log.info("fit %s"%i)
         if "Keras" in str(clf) and "verbose" in params:
             params["fit_params"] = dict(verbose=params["verbose"])
             
         # save out-of-fold predictions to fit metaclf
         if hasattr(clf, "predict_proba"):
             method = "predict_proba"
         else:
             method = "predict"
         pred = cross_val_predict(clf, x, y, 
                                  cv=self.cv, verbose=0,
                                  method=method,
                                  **params)        
         preds.append(pred)
         
         # fully fitted to predict test data
         clf.fit(x, y, verbose=0)
     
     # fit metaclf on out-of-fold predictions
     log.info("fit metaclf")
     self.metaclf.fit(np.hstack(preds), y)
     return self
Example #11
 def crossval(self, verbose=0, seed=0, method="predict", **params):
     """ returns crossval score
         sets self.preds
     """
     # track time spent per run
     starttime = time()
     
     np.random.seed(seed)         
 
     # useful for keras but throws exception for others
     if "Keras" in get_clfname(self.clf):
         self.clf.set_params(verbose=verbose)
         
     self.clf.set_params(**params)
     
     self.preds = cross_val_predict(self.clf, self.xtrain, self.ytrain,
                                    method=method)
     score = self.scorer._score_func(self.ytrain, self.preds) \
                     * self.scorer._sign
     
     # log results
     params.update(clf=get_clfname(self.clf),
                   name=self.name,
                   score=score, 
                   elapsed=time()-starttime)
     if self.runs:
         self.runs.append(params, self.preds)
         
     return score
def test_cross_val_predict_input_types():
    clf = Ridge()
    # Smoke test
    predictions = cross_val_predict(clf, X, y)
    assert_equal(predictions.shape, (10,))

    # test with multioutput y
    predictions = cross_val_predict(clf, X_sparse, X)
    assert_equal(predictions.shape, (10, 2))

    predictions = cross_val_predict(clf, X_sparse, y)
    assert_array_equal(predictions.shape, (10,))

    # test with multioutput y
    predictions = cross_val_predict(clf, X_sparse, X)
    assert_array_equal(predictions.shape, (10, 2))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    predictions = cross_val_predict(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    predictions = cross_val_predict(clf, X, y.tolist())

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    check_3d = lambda x: x.ndim == 3
    clf = CheckingClassifier(check_X=check_3d)
    predictions = cross_val_predict(clf, X_3d, y)
    assert_array_equal(predictions.shape, (10,))
Example #13
 def crossVertifyTestData(self):
     """
     Cross-validate on the test data and return the results
         :param self: the class instance itself
         :returns: the true y and the predicted y, with the true y first
     """
     # Perform cross-validation; the true labels must be passed as well
     predict_y = cross_val_predict(self.model, self.test_X, self.test_y, cv=10)
     return self.test_y, predict_y
Example #14
    def _get_estimator_mse(self, x, y, estimator):
        """Return the MSE for *estimator*.

        Use GroupKFold where a group is a combination of input size and number
        of workers. The prediction for a group is made while that group is out
        of the training set.
        """
        groups = self._groups.loc[x.index]
        cv = GroupKFold(n_splits=3)
        prediction = cross_val_predict(estimator, x, y, groups=groups, cv=cv)
        return metrics.mean_squared_error(y, prediction)
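# --- Minimal sketch (added; an assumption, not the project's code): GroupKFold
# keeps every row of a group out of the training set when that group is being
# predicted, which is what _get_estimator_mse relies on.
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold, cross_val_predict

rng = np.random.RandomState(0)
X_demo = rng.rand(30, 2)
y_demo = X_demo @ np.array([2.0, -1.0]) + 0.1 * rng.randn(30)
groups_demo = np.repeat(np.arange(10), 3)   # 10 groups of 3 rows each
pred_demo = cross_val_predict(LinearRegression(), X_demo, y_demo,
                              groups=groups_demo, cv=GroupKFold(n_splits=3))
print(metrics.mean_squared_error(y_demo, pred_demo))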
Example #15
    def evaluate(self, exp):
        """Split data, fit, transfrom features, tf*idf, svd, report."""
        t1 = time()

        exp.seed = 42
        exp.nj = -1
        exp.test_size = 0.3 if not hasattr(exp, 'test_size') else exp.test_size
        np.random.RandomState(exp.seed)

        # report features
        if hasattr(exp.pln[0], 'features'):
            exp.log.head(exp.pln.features, exp.name, exp.seed)

        # stream data to features
        X, y = exp.vec.fit_transform(exp.data)

        # if no test data, split
        if not hasattr(self, 'test_data'):
            X, Xi, y, yi = train_test_split(
                X, y, test_size=exp.test_size, stratify=y)
        else:
            Xi, yi = exp.vec.transform(self.test_data)

        av = self.average
        # grid search and fit best model choice
        exp.pln = self.grid_search(exp.pln, X, y, exp.seed)
        print("\n Training model...")
        exp.pln.fit(X, y)
        print(" done!")

        labs = exp.vec.encoder.classes_
        exp.log.data('sparse', 'train', X)

        # if user wants to report more than best score, do another CV on train
        # if hasattr(self, 'detailed_train'):
        sco = cross_val_predict(exp.pln, X, y, cv=self.cv, n_jobs=exp.nj)
        self.res['train'] = exp.log.report('train', y, sco, av, labs)

        exp.log.data('sparse', 'test', Xi, dump=True)
        res = exp.pln.predict(Xi)
        self.res['test'] = exp.log.report('test', yi, res, av, labs)

        if hasattr(self, 'proportions'):
            self._run_proportions((X, Xi, y, yi), exp)

        print("\n # ------------------------------------------ \n")
        t2 = time()
        dur = round(t2 - t1, 1)
        self.res['dur'] = dur
        print("\n Experiment took {0} seconds".format(dur))

        exp.store()
        print("\n" + '-' * 10, "\n")
Example #16
def save_fit_plot(x, y, fit, name, folder):
    predicted = cross_val_predict(fit, x, y, cv=10)
    linfit = np.polyfit(y, predicted, 1)

    fig, ax = plt.subplots()
    ax.scatter(y, predicted, s=1, alpha=0.1)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], "k--", lw=2)
    ax.plot(y, np.poly1d(linfit)(y), "g--", lw=2)
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")
    f_name = timed_filename(name, "pdf")
    plt.savefig(os.path.join(folder, f_name))
Example #17
def test_cross_val_predict():
    """Test cross_val_predict with predict_proba."""
    from sklearn.linear_model import LinearRegression
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.base import BaseEstimator, clone
    from sklearn.model_selection import cross_val_predict
    rng = np.random.RandomState(42)
    X = rng.randn(10, 1, 3)
    y = rng.randint(0, 2, 10)

    estimator = SlidingEstimator(LinearRegression())
    cross_val_predict(estimator, X, y, cv=2)

    class Classifier(BaseEstimator):
        """Moch class that does not have classes_ attribute."""

        def __init__(self):
            self.base_estimator = LinearDiscriminantAnalysis()

        def fit(self, X, y):
            self.estimator_ = clone(self.base_estimator).fit(X, y)
            return self

        def predict_proba(self, X):
            return self.estimator_.predict_proba(X)

    with pytest.raises(AttributeError, match="classes_ attribute"):
        estimator = SlidingEstimator(Classifier())
        cross_val_predict(estimator, X, y, method='predict_proba', cv=2)

    estimator = SlidingEstimator(LinearDiscriminantAnalysis())
    cross_val_predict(estimator, X, y, method='predict_proba', cv=2)
Example #18
def test_cross_val_predict():
    """Make sure it works in cross_val_predict."""

    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    clf = FMClassifier(rank=2, solver='L-BFGS-B', random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(clf, X, y, cv=cv, method='predict')
    acc = accuracy_score(y, y_oos)

    assert acc >= 0.90, "accuracy is too low for iris in cross_val_predict!"
def cross_val_pred_plot(model, X, y, consum_col, consum_col_pred, denorm_target, model_name=None, print_plot=False, cv=5):
    if model_name is None:
        model_name = 'Model'
        print("\nInsert model name\n")
    # `'multi' or 'mlp' in name` is always truthy in Python; test each keyword explicitly
    if any(key in model_name.lower() for key in ('multi', 'mlp', 'perceptron')):
        warnings.filterwarnings("ignore", category=DeprecationWarning)  # run this line separately
        whole_pred = cross_val_predict(model, X.values, y.values, cv=cv)
    else:
        whole_pred = cross_val_predict(model, X, y, cv=cv)
    whole_predictions = pd.Series(whole_pred.ravel(), index=y.index)
    whole_predictions = whole_predictions.rename(consum_col_pred)
    whole = pd.DataFrame(whole_predictions).join(y)
    # clip only the negative predictions to zero, not the whole row
    whole.loc[whole[consum_col_pred] < 0.0, consum_col_pred] = 0
    r2 = metrics.r2_score(y, whole_pred)
    if print_plot:
        whole.plot(title=model_name + '-Whole dataset predictions - score {}'.format(r2))
        plt.ylabel('Power consumption in Watts')
#        plt.xlabel('Date Time')
    #    print("\nR2 score: ", metrics.r2_score(y, whole_pred), "\n")

    if (model_name == 'svr') or (model_name == 'mlp'):
        # de-normalize before computing errors in original units
        denorm_whole = whole*(denorm_target.max().values[0]-denorm_target.min().values[0])+denorm_target.min().values[0]
        mae = metrics.mean_absolute_error(denorm_whole[consum_col], denorm_whole[consum_col_pred])
        mse = metrics.mean_squared_error(denorm_whole[consum_col], denorm_whole[consum_col_pred])
        whole = denorm_whole
#        if 'mlp' in model_name:
#            print('calculating metrics of MLP')
#            acc = model.score(X.values, y.values)
#        else:
#            print('calculating metrics of SVR')
#            acc = model.score(X, y)
    else:
        print('calculating metrics of LNR or RDF')
        mae = metrics.mean_absolute_error(y, whole_pred)
        mse = metrics.mean_squared_error(y, whole_pred)
#        acc = model.score(X, y)
    return whole, r2, mae, mse
Example #20
def test_cross_val_predict_class_subset():

    X = np.arange(8).reshape(4, 2)
    y = np.array([0, 0, 1, 2])
    classes = 3

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)

        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        y = [1, 1, -4, 6]
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)
Example #21
def test_cross_val_predict():
    boston = load_boston()
    X, y = boston.data, boston.target
    cv = KFold()

    est = Ridge()

    # Naive loop (should be same as cross_val_predict):
    preds2 = np.zeros_like(y)
    for train, test in cv.split(X, y):
        est.fit(X[train], y[train])
        preds2[test] = est.predict(X[test])

    preds = cross_val_predict(est, X, y, cv=cv)
    assert_array_almost_equal(preds, preds2)

    preds = cross_val_predict(est, X, y)
    assert_equal(len(preds), len(y))

    cv = LeaveOneOut()
    preds = cross_val_predict(est, X, y, cv=cv)
    assert_equal(len(preds), len(y))

    Xsp = X.copy()
    Xsp *= (Xsp > np.median(Xsp))
    Xsp = coo_matrix(Xsp)
    preds = cross_val_predict(est, Xsp, y)
    assert_array_almost_equal(len(preds), len(y))

    preds = cross_val_predict(KMeans(), X)
    assert_equal(len(preds), len(y))

    class BadCV():
        def split(self, X, y=None, labels=None):
            for i in range(4):
                yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])

    assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV())
Example #22
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
Example #23
def cv_BIKE_Ridge( A_list, yV, alpha = 0.5, XX = None, n_splits = 5, n_jobs = -1, grid_std = None):

	clf = binary_model.BIKE_Ridge( A_list, XX, alpha = alpha)
	ln = A_list[0].shape[0] # ln is the number of molecules.
	kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
	kf_n = kf_n_c.split( A_list[0])

	AX_idx = np.array([list(range( ln))]).T
	yV_pred = model_selection.cross_val_predict( clf, AX_idx, yV, cv = kf_n, n_jobs = n_jobs)

	print('The prediction output using cross-validation is given by:')
	jutil.cv_show( yV, yV_pred, grid_std = grid_std)

	return yV_pred
 def fit(self,proba_exclude=False,proba_threshold=0.5,n_jobs=1,cv=None,clf=None):
     from sklearn.linear_model import LogisticRegressionCV
     from sklearn.model_selection import cross_val_predict,KFold
     from sklearn.pipeline import Pipeline
     from sklearn.preprocessing import StandardScaler
     decision_features = self.decision_features
     auto_labels = self.auto_labels
     if cv is None:
         cv = KFold(n_splits=5,shuffle=True,random_state=12345)
     if clf is None:
         clf = LogisticRegressionCV(Cs=np.logspace(-4,6,11),
                                cv=cv,
                                tol=1e-5,
                                max_iter=int(1e4),
                                scoring='roc_auc',
                                class_weight='balanced',
                                n_jobs=n_jobs)
         clf = Pipeline([('scaler',StandardScaler()),
                     ('estimator',clf)])
     
     try:
         auto_proba = cross_val_predict(clf,decision_features,auto_labels,cv=cv,method='predict_proba',n_jobs=n_jobs)
         auto_proba = auto_proba[:,-1]
     except Exception:
         # fall back to coarser integer splits if the supplied cv fails
         try:
             auto_proba = cross_val_predict(clf,decision_features,auto_labels,cv=5,method='predict_proba',n_jobs=n_jobs)
             auto_proba = auto_proba[:,-1]
         except Exception:
             auto_proba = cross_val_predict(clf,decision_features,auto_labels,cv=3,method='predict_proba',n_jobs=n_jobs)
             auto_proba = auto_proba[:,-1]
     if proba_exclude:
         idx_ = np.where(auto_proba < proba_threshold)
         auto_labels[idx_] = 0
         #auto_proba[idx_]
     self.auto_labels = auto_labels
     self.auto_proba = auto_proba
Example #25
def check_cross_val_predict_with_method(est):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))

    kfold = KFold(len(iris.target))

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test alternative representations of y
        predictions_y1 = cross_val_predict(est, X, y + 1, method=method,
                                           cv=kfold)
        assert_array_equal(predictions, predictions_y1)

        predictions_y2 = cross_val_predict(est, X, y - 2, method=method,
                                           cv=kfold)
        assert_array_equal(predictions, predictions_y2)

        predictions_ystr = cross_val_predict(est, X, y.astype('str'),
                                             method=method, cv=kfold)
        assert_array_equal(predictions, predictions_ystr)
Example #26
def scan1D(X, y, window=100, estimator_params=dict(n_jobs=-1), cv=3):
    "Sliding scanner for variable length input samples"
    inputs, labels, instances = [], [], []
    instance_count = 0
    for sample, label in zip(X, y):
        sample_len = len(sample)
        for s in range(sample_len - window):
            inputs.append(sample[s: s + window].flatten())
            labels.append(label)
            instances.append(instance_count)
        instance_count += 1
    rf = RandomForestClassifier(**estimator_params)
    # Copy before adding max_features=1 so the (mutable) default dict passed
    # as estimator_params is not modified across calls.
    cf_params = dict(estimator_params, max_features=1)
    cf = RandomForestClassifier(**cf_params)
    probas1 = cross_val_predict(rf, inputs, labels, cv=cv, method='predict_proba')
    probas2 = cross_val_predict(cf, inputs, labels, cv=cv, method='predict_proba')
    probas = []
    for instance in set(instances):
        mask = [i == instance for i in instances]
        p1 = probas1[mask]
        p2 = probas2[mask]
        p = np.concatenate([p1.flatten(), p2.flatten()], axis=0)
        probas.append(p)
    return probas
    def XValidatePredict(self, labels, values, folds, stratified=True):
        '''
        :param labels: class of each sample
        :param values: feature values for each sample
        :param folds: number of folds
        :param stratified: boolean whether to use stratified K fold
        :return: cross-validated estimates for each input data point
        '''
        if stratified:
            # an integer cv gives StratifiedKFold for classifiers in scikit-learn
            CV = folds
        else:
            CV = KFold(n_splits=folds)

        predictions = cross_val_predict(self.classifier, X=values, y=labels, cv=CV, n_jobs=1)
        return np.array(predictions)
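# --- Sketch (added; an assumption, not the project's code): what the two
# branches of XValidatePredict produce. An integer cv gives StratifiedKFold
# for classifiers; an explicit KFold object disables stratification.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_predict

X_demo, y_demo = load_iris(return_X_y=True)
clf_demo = RandomForestClassifier(n_estimators=50, random_state=0)
preds_stratified = cross_val_predict(clf_demo, X_demo, y_demo, cv=5)
preds_plain = cross_val_predict(clf_demo, X_demo, y_demo, cv=KFold(n_splits=5))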
Example #28
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.

    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
Example #29
def test_cross_validation_is_finite(estimator, build_dataset):
  """Tests that validation on metric-learn estimators returns something finite
  """
  input_data, labels, preprocessor, _ = build_dataset()
  estimator = clone(estimator)
  estimator.set_params(preprocessor=preprocessor)
  set_random_state(estimator)
  assert np.isfinite(cross_val_score(estimator,
                                     *remove_y_quadruplets(estimator,
                                                           input_data,
                                                           labels))).all()
  assert np.isfinite(cross_val_predict(estimator,
                                       *remove_y_quadruplets(estimator,
                                                             input_data,
                                                             labels)
                                       )).all()
Example #30
def cv_SVR( xM, yV, svr_params, n_splits = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
	"""
	Cross validation is performed so as to generate prediction output for all input molecules.
	"""
	print(xM.shape, yV.shape)

	clf = svm.SVR( **svr_params)
	kf_n_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
	kf_n = kf_n_c.split( xM)
	yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

	if graph:
		print('The prediction output using cross-validation is given by:')
		jutil.cv_show( yV, yV_pred, grid_std = grid_std)

	return yV_pred
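# --- Sketch (added; an assumption, not the author's code): cross_val_predict
# also accepts a pre-built iterable of (train, test) index arrays, which is
# exactly what cv_SVR passes in via kf_n_c.split(xM).
import numpy as np
from sklearn import model_selection, svm

xM_demo = np.random.rand(40, 3)
yV_demo = np.random.rand(40)
splits_demo = model_selection.KFold(n_splits=5, shuffle=True).split(xM_demo)
yV_pred_demo = model_selection.cross_val_predict(svm.SVR(), xM_demo, yV_demo, cv=splits_demo)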
Example #31
trainingErrors = []
validationErrors = []
for num in range(1, 4):
    print(f'Degree {num} ')
    poly_reg = PolynomialFeatures(degree=num)
    X_poly = poly_reg.fit_transform(X_train)
    lin_reg_2 = LinearRegression(n_jobs=-1)
    lin_reg_2.fit(X_poly, y_train)
    y_pred = lin_reg_2.predict(X_poly)

    trainingMSE = (y_pred - y_train)**2
    trainingMSE = (np.sum(trainingMSE)) / len(y_pred)
    print(f'training error {trainingMSE}')
    trainingErrors.append(trainingMSE)

    prediction = cross_val_predict(lin_reg_2, X_poly, y_train, cv=5)
    validationError = (prediction - y_train)**2
    validationError = (np.sum(validationError)) / len(prediction)
    print(f'validation error {validationError} \n')
    validationErrors.append(validationError)

validationErrors = np.array(validationErrors)
pos = validationErrors.argmin()

# Plot training error and validation error against model complexity
fig, ax = plt.subplots()
ax.plot(list(range(1, 4)), trainingErrors, '-', label='training data')
ax.plot(list(range(1, 4)), validationErrors, '-', label='validation data')
ax.axvline(x=list(range(1, 4))[pos], linestyle='--', label='best fit')
ax.set_xlabel('Model Complexity (Degree)')
ax.set_ylabel('Mean Squared Error')
Example #32
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print('The accuracy of the Random Forests is',
      metrics.accuracy_score(prediction, test_y))

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

kfold = KFold(n_splits=10, shuffle=True, random_state=1998)  # random_state takes effect only with shuffle=True

cv_result = cross_val_score(model, x, y, cv=kfold, scoring='accuracy')
print(cv_result.mean(), cv_result.std())

from sklearn.metrics import confusion_matrix

y_pred = cross_val_predict(RandomForestClassifier(n_estimators=100),
                           x,
                           y,
                           cv=10)
sns.heatmap(confusion_matrix(y, y_pred), annot=True, fmt='2.0f')
plt.title('Random Forest Confusion Matrix')

test.info()

train.info()

y_pred = model.predict(test)

df = pd.DataFrame(y_pred)
df.to_csv('pred.csv')
Example #33
def regression():
    warnings.filterwarnings('ignore')
    warnings.simplefilter('ignore')
    pd.options.display.float_format = '{:.2f}'.format
    # init_notebook_mode(connected=True)

    connection = MongoClient(os.environ["MONGODB_URL"])
    db = connection.admin.mobilede

    allData = pd.read_csv(os.environ['DIR-WORKING'] + '/vehicles.csv',
                          delimiter=';')
    allDataUnmodified = pd.read_csv(os.environ['DIR-WORKING'] +
                                    '/vehicles.csv',
                                    delimiter=';')

    # Print some basic data
    print("Price of all vehicles: {:6.2f} EUR".format(
        allData['priceEur'].mean()))

    # decide on columns - features and target - with which we are going to work
    target_column = 'priceEur'
    feature_columns = [
        'id', 'kmState', 'numOfPrevOwners', 'makeModel', 'power',
        'firstRegistration', 'derivedKmPerYear', 'emissionSticker', 'plz',
        'numOfSeats'
    ]

    feature_and_target_columns = list(feature_columns)
    feature_and_target_columns.append(target_column)
    regressionData = allData[feature_and_target_columns]

    # transform strings to NaN where appropriate and drop the rows,
    # where at least one feature value is missed
    prevSize = regressionData.shape[0]
    regressionData = regressionData.replace('unknown', np.NaN)
    regressionData = regressionData.replace('NaN', np.NaN)
    regressionData = regressionData.replace('nan', np.NaN)
    regressionData = regressionData.dropna()

    # make some features conversion, like from numerical into categorical
    regressionData['makeModel'] = regressionData['makeModel'].astype(
        'category').cat.codes
    regressionData['plz'] = regressionData['plz'].astype('category').cat.codes
    regressionData['firstRegistration'] = regressionData[
        'firstRegistration'].astype('category').cat.codes
    regressionData['emissionSticker'] = regressionData[
        'emissionSticker'].astype('category').cat.codes
    regressionData['numOfPrevOwners'] = regressionData[
        'numOfPrevOwners'].astype('float64')

    # remember the order of IDs, and do not use it in regression
    idValues = regressionData['id']
    del regressionData['id']
    feature_columns.remove('id')
    feature_and_target_columns.remove('id')

    print("Left {}/{} entries after clean up and drop".format(
        regressionData.shape[0], prevSize))

    featuresData = regressionData[feature_columns]
    y = regressionData[target_column]

    # Calculate Regression
    regr = linear_model.LinearRegression(normalize=True)

    regr.fit(featuresData, y)
    predictions = regr.predict(featuresData)

    print()
    zipped = zip(feature_columns, regr.coef_)
    for name, coef in zipped:
        print("{}: {}".format(name, coef))

    predicted = cross_val_predict(regr, featuresData, y, cv=5)

    # # x - predicted values
    # # y - actual values
    # fix, ax = plt.subplots()
    # ax.scatter(y, predicted, color='green', s=9)
    # ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=1)
    # ax.set_xlabel("Observed")
    # # TODO: add regression coefficient to the plot
    # ax.set_ylabel("Predicted")
    #
    # plt.savefig(os.environ['DIR-DATA'] + '/price_observed_vs_predicted.png')

    zipped = zip(idValues, y, predicted)

    for itemId, observedValue, predictedValue in zipped:
        itemIdStr = str(itemId)
        origRow = allData.loc[allData['id'] == itemId]
        goneOnStr = "{}".format(origRow['goneOn'].asobject[0])
        firstSeenOn = "{}".format(origRow['firstSeenOn'].asobject[0])

        if not "nan" in goneOnStr:
            daysOnline = (parser.parse(goneOnStr) -
                          parser.parse(firstSeenOn)).days
        else:
            daysOnline = -1

        try:
            derivedKmPerYear = "{:.0f}".format(
                origRow['derivedKmPerYear'].asobject[0])
        except Exception as e:
            print("Failed on derivedKmPerYear, id {} with {}".format(
                itemId, e))

        dbItem = db.find_one({"id": itemIdStr})
        dbItem["daysOnline"] = daysOnline
        dbItem["derivedKmPerYear"] = derivedKmPerYear
        dbItem["predictedPrice"] = '{:.0f}'.format(predictedValue)
        dbItem["diffSaving"] = int(predictedValue - observedValue)
        dbItem["diffSavingPercent"] = '{:.03f}'.format(
            (predictedValue - observedValue) / observedValue)
        dbItem["inputIsRegressionExcluded"] = ""
        dbItem["inputIsFavoured"] = ""

        result = db.update({"id": itemIdStr}, dbItem)

    print("*** DONE **** DONE ***")
Example #34
neural_network = KerasClassifier(build_fn=create_network, 
                                 epochs=250)

from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
clinicalOutput = np.array([number[0] for number in lb.fit_transform(clinicalOutput)])

results = cross_validate(neural_network, clinicalInput, clinicalOutput,cv=10,scoring=("accuracy","f1","recall","precision"))

import matplotlib.pyplot as plt
plt.plot(results["test_accuracy"],color="c")
plt.plot(results["test_f1"],color="m")
plt.plot(results["test_recall"],color="y")
plt.plot(results["test_precision"],color="k")
plt.title("Model Information (CNNLSTM)")
plt.ylabel("Model Performance")
plt.xlabel("Number of Folds")
plt.legend(["Accuracy","F1-Score","Recall","Precision"], loc="lower right")
plt.show()

#Determine the prediction
y_pred = cross_val_predict(neural_network, clinicalInput, clinicalOutput, cv=10)

#Provide AUC score
from sklearn.metrics import roc_auc_score

print("Accuracy result: ", np.mean(results["test_accuracy"]))
print("Recall result: ", np.mean(results["test_recall"]))
print("Precision result: ", np.mean(results["test_precision"]))
print("F1 result: ", np.mean(results["test_f1"]))
print("ROC: ", roc_auc_score(clinicalOutput, y_pred))
Example #35
]

# Autoscaling (standardization)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()

# Optimize the kernel function via cross-validation
cross_validation = KFold(n_splits=fold_number, random_state=9,
                         shuffle=True)  # configure the cross-validation splits
r2cvs = []  # empty list; collects the cross-validated r2 for each kernel
for index, kernel in enumerate(kernels):
    print(index + 1, '/', len(kernels))
    model = GaussianProcessRegressor(alpha=0, kernel=kernel)
    estimated_y_in_cv = np.ndarray.flatten(
        cross_val_predict(model,
                          autoscaled_x_train,
                          autoscaled_y_train,
                          cv=cross_validation))
    estimated_y_in_cv = estimated_y_in_cv * y_train.std(
        ddof=1) + y_train.mean()
    r2cvs.append(r2_score(y_train, estimated_y_in_cv))
optimal_kernel_number = np.where(
    r2cvs == np.max(r2cvs))[0][0]  # index of the kernel with the highest cross-validated r2
optimal_kernel = kernels[optimal_kernel_number]  # the kernel with the highest cross-validated r2
print('Kernel number selected by cross-validation :', optimal_kernel_number)
print('Kernel selected by cross-validation :', optimal_kernel)

# Build the model
model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel)  # declare the GPR model
model.fit(autoscaled_x_train, autoscaled_y_train)  # fit the model

# Estimation on the training data
Example #36
np.set_printoptions(threshold=np.inf)
with open("dataset.txt", "w") as f:
    f.write(str(dataset))

### Machine learning
# Split into training and test sets
X = dataset["data"]
y = dataset["target"]
X_train, X_test, y_train, y_test = \
         train_test_split(X, y, test_size=0.2, random_state=0)

# Build the prediction model
print("Building the prediction model...")
time.sleep(2)
reg = LinearRegression()
predicted = cross_val_predict(reg, X, y, cv=10)
reg.fit(X_train, y_train)
accuracy_train = reg.score(X_train, y_train)
accuracy_test = reg.score(X_test, y_test)
predict_y = reg.predict(X_test)
time.sleep(1)
print("Done building")
print()
time.sleep(2)
print("Training set score: %s" % (accuracy_train))
print("Test set score: %s" % (accuracy_test))

# Plot the prediction results
plt.scatter(predicted, y, s=2)
plt.plot(predict_y, predict_y, 'ro')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
Example #37
)
print(-1 * cross_val_score(lre,
                           x_data[['horsepower']],
                           y_data,
                           cv=4,
                           scoring='neg_mean_squared_error'))

print(
    'Calculate the average R^2 using two folds, find the average R^2 for the second fold utilizing the horsepower as a feature :'
)
Rcross1 = cross_val_score(lre, x_data[['horsepower']], y_data, cv=2)
print(Rcross1[1])
print(
    'You can also use the function <cross_val_predict> to predict the output. The function splits up the data into the specified number of folds, using one fold to get a prediction while the rest of the folds are used as test data. First import the function:'
)
yhat = cross_val_predict(lre, x_data[['horsepower']], y_data, cv=4)
print(yhat[0:5])

#part 2: Over/Under fitting and model selection
lr = LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],
       y_train)

yhat_train = lr.predict(
    x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print(yhat_train[0:5])

yhat_test = lr.predict(
    x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print(yhat_test[0:5])
Example #38
scores = cross_validate(estimator=grid_search,
                        X=X,
                        y=y,
                        cv=5,
                        error_score='raise',
                        return_estimator=True,
                        scoring=scoring)  # outer
print('Scores: {}'.format(scores['test_score']))
print('Mean score: {}'.format(np.mean(scores['test_score'])))

# Create a 'dummy' classifier and also evaluate it with cross-validation (CV=5) to get a more realistic baseline
dummy_clf = DummyClassifier(strategy='most_frequent',
                            random_state=random_state)
dummy_scores = cross_validate(estimator=dummy_clf,
                              X=X,
                              y=y,
                              cv=5,
                              error_score='raise',
                              return_estimator=True,
                              scoring=scoring)
print('Dummy scores: {}'.format(dummy_scores['test_score']))
print('Dummy mean score: {}'.format(np.mean(dummy_scores['test_score'])))

# Confusion matrix
results = cross_val_predict(grid_search, X=X, y=y, cv=5)
conf_m = confusion_matrix(y, results, labels=[1, 0])
print(conf_m)

# F1 score
print(f1_score(y, results))
Example #39
    'model__gamma': [3],
    'model__max_depth': [4]
}

model = GridSearchCV(pipe, param_grid, cv=cfg["folds"], scoring='roc_auc')

logger.info("Getting best model...")
model.fit(X, y)
logger.info("Best Params: {}".format(model.best_params_))

model = model.best_estimator_
logger.info("Fitting model on upscaled X...")
model.fit(X_up, y_up)

logger.info("Predicting score (w/Cross-Val) on X...")
results = cross_val_predict(model,
                            X,
                            y,
                            cv=cfg["folds"],
                            method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-ups')

logger.info("Finished with time {:.3f} minutes".format(
    (time.time() - start) / 60.0))
Example #40
trueY_60_20 = trueY(splitPseudonym(selectData(60, 20)))
trueY_60_10 = trueY(splitPseudonym(selectData(60, 10)))
trueY_60_30 = trueY(splitPseudonym(selectData(60, 30)))

# for train_ix, test_ix in kfold.split(X_60_20, y_60_20):
#     # select rows
#     train_X, test_X = X_60_20[train_ix], X_60_20[test_ix]
#     train_y, test_y = y_60_20[train_ix], y_60_20[test_ix]
#     print("train_X: ", train_X, type(train_X), train_X.shape)
#     print("test_X: ", test_X, type(test_X), test_X.shape)
#     print("train_y: ", train_y, type(train_y), train_y.shape)
#     print("test_y: ", test_y, type(test_y), test_y.shape)

predictions_20_20_hat = cross_val_predict(clf,
                                          X_20_20,
                                          y_20_20,
                                          cv=kfold,
                                          method='predict_proba')
predictions_40_20_hat = cross_val_predict(clf,
                                          X_40_20,
                                          y_40_20,
                                          cv=kfold,
                                          method='predict_proba')
predictions_60_20_hat = cross_val_predict(clf,
                                          X_60_20,
                                          y_60_20,
                                          cv=kfold,
                                          method='predict_proba')
predictions_60_10_hat = cross_val_predict(clf,
                                          X_60_10,
                                          y_60_10,
Example #41
print("\n")
print("Model predicted for house {0} value {1}".format(did, linear_regression_prediction))
print("\n")
print("Real value for house {0} is {1}".format(did, bmd_test_target[did]))

# model evaluation
bmd_mean_square_error = mean_squared_error(bmd_test_target, linear_regression.predict(bmd_test_data))
print("\n")
print("Mean square error of a learned model: %.3f " % bmd_mean_square_error)

bmd_r2_score = r2_score(bmd_test_target, linear_regression.predict(bmd_test_data))
print("\n")
print(f"Variance score: %.3f" % bmd_r2_score)
print("\n")
print('Coefficients of a learned model: \n', linear_regression.coef_)

scores = cross_val_score(LinearRegression(), bmd['data'], bmd['target'], cv=4)
print("\n")
print(f"Cross-validation score: {scores}")

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(linear_regression, bmd['data'], bmd['target'], cv=4)

fig, ax = plt.subplots()
ax.scatter(bmd['target'], predicted, edgecolors=(0, 0, 0))
ax.plot([bmd['target'].min(), bmd['target'].max()], [bmd['target'].min(), bmd['target'].max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
Example #42
scores = cross_val_score(logreg, iris.data, iris.target ,cv = 3)


# In[418]:

scores


# In[414]:

from sklearn.model_selection import cross_val_predict

iris = load_iris()
logreg = LogisticRegression()
scores = cross_val_predict(logreg, iris.data, iris.target ,cv = 5)


# In[415]:

scores


# In[420]:

from sklearn.model_selection import KFold
kfold = KFold(n_splits = 3)

cross_val_score(logreg, iris.data, iris.target, cv = kfold)

Example #43
        y_train_folds = y_train[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train[test_index]

        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print('accuracy:', (n_correct / len(y_pred)))

my_cross_val_score(sgd_clf, x_train, y_train_9, cv=3)


# %%
# Confusion matrix
from sklearn.model_selection import cross_val_predict
y_train_predict = cross_val_predict(sgd_clf, x_train, y_train_9, cv=3)
print(y_train_predict.shape)
y_train_predict[:5]

# %%
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_9, y_train_predict)

# %%
from sklearn.metrics import precision_score, recall_score, f1_score
print('precision_score:', precision_score(y_train_9, y_train_predict))
print('recall_score:', recall_score(y_train_9, y_train_predict))
print('f1_score:', f1_score(y_train_9, y_train_predict))


# %%

# In[ ]:


lreg = LogisticRegression()
lreg_yhat= lreg.fit(X, y).predict(X)

lreg_sas = accuracy_score(y, lreg_yhat)
lreg_cv5s = cross_val_score(lreg, X, y, cv=5, n_jobs=-1).mean()
lreg_l1os = cross_val_score(lreg, X, y, cv=LeaveOneOut().split(X), n_jobs=-1).mean()
print('Self Accuracy Score : {}'.format(lreg_sas))
print('CV5 Score : {}'.format(lreg_cv5s))
print('CVLeave1Out Score : {}'.format(lreg_l1os))

lreg_pvsa_survival = np.column_stack((cross_val_predict(lreg, X, y, cv=5, n_jobs=-1), y))
print('Predicted Survival : {}'.format(lreg_pvsa_survival[:,0].mean()))
print('Actual Survival : {}'.format(lreg_pvsa_survival[:,1].mean()))
print(classification_report(y, lreg_pvsa_survival[:,0], target_names=['dead','notdead']))

cm = confusion_matrix(y,lreg_pvsa_survival[:,0])
ax = plt.axes()
sns.heatmap(cm, ax=ax, fmt='d', square=True, annot=True, vmin=0)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('LREG - Survival - Confusion Matrix')


# In[ ]:

Example #45
    # pprint(y)
    # iris = sns.load_dataset("iris")
    # X = iris.values[50:150, 0:4]
    # y = iris.values[50:150, 4]

    # sns.pairplot(iris,hue='species')
    # sns.plt.show()

    # 2-nd logistic regression using sklearn

    # log-regression lib model
    log_model = LogisticRegression()
    m = np.shape(X)[0]

    # 10-folds CV
    y_pred = cross_val_predict(log_model, X, y, cv=10)
    print(metrics.accuracy_score(y, y_pred))


    # LOOCV
    # loo = LeaveOneOut()
    # accuracy = 0
    # for train, test in loo.split(X):
    #     log_model.fit(X[train], y[train])  # fitting
    #     y_p = log_model.predict(X[test])
    #     if y_p == y[test]: accuracy += 1
    # print(accuracy / np.shape(X)[0])

    # m = np.shape(X)[0]
    # scores_loo = cross_val_score(log_model, X, y, cv=m)
    # print(scores_loo)
Example #46
if regression_flag == 1:
    PLScomponents = np.arange(
        1,
        min(
            np.linalg.matrix_rank(autoscaled_Xtrain) + 1,
            maxPLScomponentnumber + 1), 1)
    r2all = list()
    r2cvall = list()
    for PLScomponent in PLScomponents:
        plsmodelincv = PLSRegression(n_components=PLScomponent)
        plsmodelincv.fit(autoscaled_Xtrain, autoscaled_ytrain)
        calculatedyincv = np.ndarray.flatten(
            plsmodelincv.predict(autoscaled_Xtrain))
        estimatedyincv = np.ndarray.flatten(
            model_selection.cross_val_predict(plsmodelincv,
                                              autoscaled_Xtrain,
                                              autoscaled_ytrain,
                                              cv=fold_number))
        calculatedyincv = calculatedyincv * ytrain.std(ddof=1) + ytrain.mean()
        estimatedyincv = estimatedyincv * ytrain.std(ddof=1) + ytrain.mean()

        r2all.append(
            float(1 - sum((ytrain - calculatedyincv)**2) /
                  sum((ytrain - ytrain.mean())**2)))
        r2cvall.append(
            float(1 - sum((ytrain - estimatedyincv)**2) /
                  sum((ytrain - ytrain.mean())**2)))
    plt.plot(PLScomponents, r2all, 'bo-')
    plt.plot(PLScomponents, r2cvall, 'ro-')
    plt.ylim(0, 1)
    plt.xlabel('Number of PLS components')
    plt.ylabel('r2(blue), r2cv(red)')
        features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
        features_tfidf.columns = tfidf.get_feature_names()
        features_combined = pandas.concat([features_tfidf, derived_features],
                                          axis=1)

        logging.info('Combined features shape:')
        logging.info(features_combined.shape)

        svm_object = LogisticRegression()
        classifier = OneVsRestClassifierBalance(svm_object)

        logging.info('Getting per-class scores')
        y_pred = cross_val_predict(classifier,
                                   features_combined.values,
                                   labels_matrix,
                                   cv=10)

        logging.info('Computing overall results')
        scores_f1 = cross_val_score(classifier,
                                    features_combined.values,
                                    labels_matrix,
                                    cv=10,
                                    scoring='f1_weighted').mean()

        logging.info(classification_report(labels_matrix, y_pred, digits=3))
        logging.info('f1_weighted : {0}'.format(scores_f1))

        end = time.time()
        runtime_in_seconds = end - start
        logging.info('Processing completed in {0}'.format(runtime_in_seconds))
Example #48
                            random_state=1)

# Perform cross-validation
scores = cross_val_score(cv=kf,
                         estimator=clf,
                         X=X_train,
                         y=y_train,
                         scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2*scores.std()))

# Gather predictions
predictions = cross_val_predict(cv=kf,
                                estimator=clf,
                                X=X_train,
                                y=y_train)

accuracy_score = metrics.accuracy_score(y_train, predictions)
print('accuracy score: '+str(accuracy_score))

confusion_matrix = metrics.confusion_matrix(y_train, predictions)

class_names = encoder.classes_.tolist()
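# class_names presumably feeds a confusion-matrix plot; a minimal sketch,
# assuming matplotlib.pyplot as plt and seaborn as sns are imported elsewhere:
ax = plt.axes()
sns.heatmap(confusion_matrix, ax=ax, fmt='d', annot=True,
            xticklabels=class_names, yticklabels=class_names)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()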


#Train the classifier
clf.fit(X=X_train, y=y_train)

model = {'classifier': clf, 'classes': encoder.classes_, 'scaler': X_scaler}
Example #49
    def predict(self):
        '''Do predictions using the best extreme random forest on the test set as
    well as on the training set with 3 cross-validation folds, and do some initial
    analysis of the output.'''
        print('*' * 80)
        print('*    Predict using new forest and test/train_CV set')
        print('*' * 80)

        #try out how well the classifier works to predict from the test set
        self.y_pred = self.extra_clf_rand_new.predict(self.X_metrix_test)
        self.y_pred_proba = self.extra_clf_rand_new.predict_proba(
            self.X_metrix_test)
        with open(
                os.path.join(self.output_dir,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write(
                'Saving predictions and probabilities for X_metrix_test in y_pred and probabilities in y_pred_proba \n'
            )

        #alternative way to not have to use the test set
        self.y_train_CV_pred = cross_val_predict(self.extra_clf_rand_new,
                                                 self.X_metrix_train,
                                                 self.y_train,
                                                 cv=3)
        self.y_train_CV_pred_proba = cross_val_predict(self.extra_clf_rand_new,
                                                       self.X_metrix_train,
                                                       self.y_train,
                                                       cv=3,
                                                       method='predict_proba')
        with open(
                os.path.join(self.output_dir,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write(
                'Saving predictions and probabilities for X_metrix_train with 3-fold CV in y_train_CV_pred \n'
            )

        print('*' * 80)
        print('*    Calculate prediction stats')
        print('*' * 80)

        def prediction_stats(y_test, y_pred, directory):
            # calculate accuracy
            y_accuracy = metrics.accuracy_score(y_test, y_pred)

            # examine the class distribution of the testing set (using a Pandas Series method)
            class_dist = y_test.value_counts()

            # calculate the percentage of ones
            # because y_test only contains ones and zeros, the mean equals the percentage of ones
            ones = y_test.mean()

            # calculate the percentage of zeros
            zeros = 1 - y_test.mean()

            # calculate null accuracy in a single line of code
            # only for binary classification problems coded as 0/1
            null_acc = max(y_test.mean(), 1 - y_test.mean())

            with open(
                    os.path.join(directory,
                                 'extreme_randomforest_randomsearch.txt'),
                    'a') as text_file:
                text_file.write(
                    'Accuracy score or agreement between y_test and y_pred: %s \n'
                    % y_accuracy)
                text_file.write('Class distribution for y_test: %s \n' %
                                class_dist)
                text_file.write('Percent 1s in y_test: %s \n' % ones)
                text_file.write('Percent 0s in y_test: %s \n' % zeros)
                text_file.write('Null accuracy in y_test: %s \n' % null_acc)

        prediction_stats(self.y_test, self.y_pred, self.output_dir)
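        # The saved probabilities also support a threshold-free metric; a
        # sketch, assuming binary labels with the positive class in column 1:
        cv_auc = metrics.roc_auc_score(self.y_train,
                                       self.y_train_CV_pred_proba[:, 1])
        print('3-fold CV ROC AUC: %s' % cv_auc)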
Example #50
}

grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
grid_log_reg.fit(X_train, y_train)
# We obtain the logistic regression model with the best parameters:
log_reg = grid_log_reg.best_estimator_

# In[30]:

from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# We create a dataframe with all the scores and classifiers

log_reg_pred = cross_val_predict(log_reg,
                                 X_train,
                                 y_train,
                                 cv=5,
                                 method="decision_function")

# In[31]:

from sklearn.metrics import roc_auc_score

print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))

# In[32]:

# We visualize the ROC curve, which measures the model's performance. Here we
# keep logistic regression as the model to apply to our data.
log_fpr, log_tpr, log_threshold = roc_curve(y_train, log_reg_pred)
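# The fpr/tpr arrays are presumably plotted next; a minimal sketch, assuming
# matplotlib.pyplot is imported as plt:
plt.plot(log_fpr, log_tpr, label='Logistic Regression')
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()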
Example #51
#train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))]) if test and train data are not separate
#do the PCA first
input_train, input_validate, output_train, output_validate = train_test_split(
    data_train_nzv, classe_col_train, test_size=0.3, random_state=0)
std_scale = preprocessing.StandardScaler().fit(input_train)
input_train_std = std_scale.transform(input_train)
input_valid_std = std_scale.transform(input_validate)
input_test_std = std_scale.transform(data_test_nzv)
pca_std = PCA(n_components=0.9).fit(input_train_std)
input_train_std = pca_std.transform(input_train_std)
input_valid_std = pca_std.transform(input_valid_std)
data_test_nzv_std = pca_std.transform(input_test_std)  # apply PCA to the standardized test data
dt = tree.DecisionTreeClassifier()  #can provide depth value in ()
model = dt.fit(X=input_train_std, y=output_train)
predictions = cross_val_predict(model,
                                X=input_valid_std,
                                y=output_validate,
                                cv=10)  # n_jobs would set the number of CPUs
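# Note: cross_val_predict clones and refits the estimator on each fold, so the
# fit from dt.fit(...) above is not reused here.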
print('Decision Tree Score:',
      metrics.accuracy_score(output_validate, predictions))
# print("Score:", model.score(X_valid, y_valid))
#Visualize Tree
tree.export_graphviz(model, out_file='tree.dot')
dot_data = StringIO()
tree.export_graphviz(model, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png("decision_tree.png")
#fit random forest
rf = RandomForestClassifier()
model_rf = rf.fit(X=input_train_std, y=output_train)
predictions_rf = cross_val_predict(model_rf,
                                   X=input_valid_std,
                                   y=output_validate,
                                   cv=10)
Example #52
def sample_and_cross_val_clf(train_size=200,
                             noise_corr=2,
                             dim=3,
                             sep=.5,
                             random_state=0):
    """ Runs an experiments and returns the corresponding lines in
        the results dataframe.
    """
    clf = LinearSVC_continuous(penalty='l2', fit_intercept=True)

    n_samples = train_size + 10000
    X, y = mk_data(n_samples=n_samples,
                   separability=sep,
                   random_state=random_state,
                   noise_corr=noise_corr,
                   dim=dim)
    X_train = X[:train_size]
    y_train = y[:train_size]
    X_test = X[train_size:]
    y_test = y[train_size:]

    validation_score = roc_auc_score(y_test,
                                     clf.fit(X_train, y_train).predict(X_test))

    # Create 10 blocks of evenly-spaced labels for GroupShuffleSplit
    groups = np.arange(train_size) // (train_size // 10)

    scores = list()
    for name, cv in [('10 repeated 10-fold',
                      RepeatedKFold(n_splits=10,
                                    n_repeats=10,
                                    random_state=random_state)),
                     ('50 splits',
                      GroupShuffleSplit(n_splits=50,
                                        random_state=random_state))]:
        try:
            cv_scores = cross_val_score(clf,
                                        X_train,
                                        y_train,
                                        groups=groups,
                                        scoring='roc_auc',
                                        cv=cv)
        except Exception:
            if name == '10 repeated 10-fold':
                try:
                    cv_scores = [
                        roc_auc_score(
                            y_train,
                            cross_val_predict(clf,
                                              X_train,
                                              y_train,
                                              groups=groups,
                                              cv=10))
                    ]
                except Exception:
                    cv_scores = [np.nan]
            else:
                cv_scores = [np.nan]

        scores.append(
            dict(cv_name=name,
                 validation_score=validation_score,
                 train_size=train_size,
                 dim=dim,
                 noise_corr=noise_corr,
                 sep=sep,
                 score_error=(np.mean(cv_scores) - validation_score),
                 score_sem=(np.std(cv_scores) / np.sqrt(len(cv_scores)))))

    return scores
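# Usage sketch (mk_data, LinearSVC_continuous and pandas-as-pd are assumed to
# be defined/imported elsewhere in this script):
# rows = sample_and_cross_val_clf(train_size=200, noise_corr=2, dim=3, sep=.5)
# results_df = pd.DataFrame(rows)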
Example #53
for i, dicionario in enumerate(grid.cv_results_['params']):
    z = dicionario.copy()
    z.update({'mean': round(medias_teste[i], 4)})
    print(z)

# Print the best parameters and accuracy of the best GaussianNB
print(grid.best_params_)
print(grid.best_score_)

# Create the classifier
clf = GaussianNB(var_smoothing=1e-10)

# Inspect the model's score, accuracy and metrics using cross-validation
resultado = cross_val_score(clf, previsores, classe, cv=5, scoring='accuracy')
resultados = cross_val_predict(clf, previsores, classe, cv=5)
valor_classes = sorted(np.unique(classe))
print(
    f'The standard deviation of the scores across all folds of the GaussianNB model is {round(resultado.std(), 4)}'
)
print(
    f'The accuracy of the GaussianNB model is {round(metrics.accuracy_score(classe,resultados) * 100, 2)}%'
)
print(
    f'The metrics of the GaussianNB model are:\n {metrics.classification_report(classe,resultados,valor_classes)}'
)

# Build the model using holdout
p_treinamento, p_teste, c_treinamento, c_teste = train_test_split(
    previsores, classe, test_size=0.2, random_state=0)
clf.fit(p_treinamento, c_treinamento)
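# The holdout evaluation presumably follows the fit; a minimal sketch (the
# name previsoes is introduced here):
previsoes = clf.predict(p_teste)
print(f'Holdout accuracy: {round(metrics.accuracy_score(c_teste, previsoes) * 100, 2)}%')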
Example #54
 participant = 'fcm'
 df_sub          = df[df['participant'] == participant]
 # for 1-back to 4-back
 for n_back in np.arange(1,5):
     X,y,groups = utils.get_features_targets_groups(
                             df_sub.dropna(),
                             n_back                  = n_back,
                             names                   = name_for_scale,
                             independent_variables   = feature_names,
                             dependent_variable      = [target_name,'correctness'])
     X,y,groups = shuffle(X,y,groups)
     y,correctness = y[:,0],y[:,1]
     for model_name,model in utils.make_clfs().items():
         cv = LeaveOneOut()
         print('{}-back,{}'.format(n_back,model_name))
         preds = cross_val_predict(model,X,y,groups=groups,cv=cv,method='predict',verbose=2,n_jobs=4)
         df_pred_ = pd.DataFrame(np.vstack([preds,correctness]).T,columns = ['preds','correct'])
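         # Add-one (Laplace) smoothing below keeps every probability estimate nonzero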
         p_correct = float(np.sum(correctness == 1)+1) / (len(correctness)+1)
         p_incorrect = float(np.sum(correctness == 0)+1) / (len(correctness)+1)
         p_aware = float(np.sum(preds == 1)+1) / (len(preds)+1)
         p_unaware = float(np.sum(preds == 0)+1) / (len(preds)+1)
         p_correct_aware = float(np.sum(np.logical_and(correctness == 1, preds == 1))+1) / (len(df_pred_)+1)
         p_correct_unaware = float(np.sum(np.logical_and(correctness == 1, preds == 0))+1) / (len(df_pred_)+1)
         p_incorrect_aware = float(np.sum(np.logical_and(correctness == 0, preds == 1))+1) / (len(df_pred_)+1)
         p_incorrect_unaware = float(np.sum(np.logical_and(correctness == 0, preds == 0))+1) / (len(df_pred_)+1)
         correlation,pval = stats.spearmanr(preds,correctness)
         results['sub'].append(participant)
         results['model'].append(model_name)
         results['corre'].append(correlation)
         results['pval'].append(pval)
         results['p(correct|awareness)'].append(p_correct_aware/p_aware)
Example #55
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)


never_5_clf = Never5Classifier()
cross_val_score2 = cross_val_score(never_5_clf,
                                   X_train,
                                   y_train_5,
                                   cv=3,
                                   scoring="accuracy")
print("cross_val_score_never5", cross_val_score2)

# Compute the confusion matrix
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
from sklearn.metrics import confusion_matrix
confusion_matrix1 = confusion_matrix(y_train_5, y_train_pred)
print("confusion_matrix", confusion_matrix1)

# Compute precision, recall and F1
from sklearn.metrics import precision_score, recall_score
precision_score1 = precision_score(y_train_5, y_train_pred)
print("precision_score1", precision_score1)
recall_score1 = recall_score(y_train_5, y_train_pred)
print("recall_score1", recall_score1)

from sklearn.metrics import f1_score
f1_score1 = f1_score(y_train_5, y_train_pred)  # avoid shadowing the imported function
print("f1_score", f1_score1)
Example #56
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

if method_name == 'pls':
    # Optimize the number of components via cross-validation
    components = []  # empty list to collect the candidate component counts
    r2_in_cv_all = []  # empty list to collect the cross-validated r2 for each component count
    for component in range(
            1,
            min(np.linalg.matrix_rank(autoscaled_x_train),
                max_number_of_principal_components) + 1):
        # PLS
        model = PLSRegression(n_components=component)  # declare the PLS model
        estimated_y_in_cv = pd.DataFrame(
            cross_val_predict(
                model, autoscaled_x_train, autoscaled_y_train,
                cv=fold_number))  # compute the CV estimates and convert to a DataFrame
        estimated_y_in_cv = estimated_y_in_cv * y_train.std() + y_train.mean()  # undo the autoscaling
        r2_in_cv = metrics.r2_score(y_train, estimated_y_in_cv)  # compute r2
        print(component, r2_in_cv)  # print the component count and r2
        r2_in_cv_all.append(r2_in_cv)  # append r2
        components.append(component)  # append the component count

    # Plot the cross-validated r2 against the number of components and take the
    # maximizer as the optimal component count
    optimal_component_number = sample_functions.plot_and_selection_of_hyperparameter(
        components, r2_in_cv_all, 'number of components', 'cross-validated r2')
    print('\nOptimal number of components selected by CV :', optimal_component_number)
    # PLS
    model = PLSRegression(n_components=optimal_component_number)  # declare the model
elif method_name == 'svr':
Example #57
# Class probabilities for each example in the dataset.
# We use predict_proba to inspect the probability estimates.
# The output is an array with each sentence in its respective position.

print (modelo.classes_)
modelo.predict_proba(freq_testes).round(2)


# In[ ]:


# Let's evaluate the model using the "cross-validation" technique with 10 folds.
# More information at the link below:
# https://scikit-learn.org/stable/modules/cross_validation.html

resultados = cross_val_predict(modelo, freq_tweets, classes, cv=10)


# In[ ]:


# Using the confusion matrix. This technique is excellent for validation, since
# it lets us analyze which examples were classified into the "wrong" classes.

print (pd.crosstab(classes, resultados, rownames=['Real'], colnames=['Predito'], margins=True))


# In[ ]:


# Let's use sklearn's metrics module. The documentation is worth a read.
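# The metrics call presumably follows; a minimal sketch:
from sklearn import metrics
print(metrics.classification_report(classes, resultados))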
Example #58
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=1)
print(x_train.shape)
print(x_test.shape)
# linearReg = linear_model.ElasticNet(alpha=0.001,l1_ratio=0.1)
# linearReg = linear_model.LinearRegression()
# linearReg = linear_model.LassoCV(alphas=[0.001,0.1, 0.01, 0.5, 1, 3, 5, 7, 10, 20, 100], cv=5)
linearReg = linear_model.Lasso(alpha=0.01)
# linearReg = linear_model.Ridge(alpha=0.001)
# linearReg.fit(x_train,y_train)
# linearReg = Ridge()
linearReg.fit(x_train,y_train.values.ravel())
# print(linearReg.alpha_)
print(linearReg.intercept_)
print(linearReg.coef_)
y_pred = linearReg.predict(x_test)
Y_pred = cross_val_predict(linearReg, x, y,cv=100)
print('MSE:')
print(metrics.mean_squared_error(y_test,y_pred))
print('RMSE:')
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

# Y_pred = cross_val_predict(linearReg, x, y, cv=50)
print("10折交叉验证MSE:", metrics.mean_squared_error(y, Y_pred))
print("10折交叉验证RMSE:", np.sqrt(metrics.mean_squared_error(y, Y_pred)))

plt.figure()
plt.title("Model Star")
plt.xlabel("Measured")
plt.ylabel("Predicted")
plt.ylim(0,3.5)
plt.grid(True)
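# The measured-vs-predicted scatter presumably follows; a minimal sketch:
plt.scatter(y, Y_pred)
plt.plot([0, 3.5], [0, 3.5], 'k--', lw=2)  # ideal y = x reference line
plt.show()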
Example #59
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#ML Algorithm
from sklearn import linear_model
clf = linear_model.LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

#Cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
print(scores)
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
print(cm)

from sklearn.metrics import precision_score, recall_score
print("precision score = {0:.4f}".format(precision_score(
    y_train, y_train_pred)))
print("recall score = {0:.4f}".format(recall_score(y_train, y_train_pred)))

#Predicting results, confusion matrix
y_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("precision score = {0:.4f}".format(precision_score(y_test, y_pred)))
print("recall score = {0:.4f}".format(recall_score(y_test, y_pred)))
Example #60
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
cols = loans.columns
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]
lr.fit(features, target)
predictions = lr.predict(features)

## Cross Validation ##

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

lr = LogisticRegression()
predictions = cross_val_predict(lr, features, target, cv=3)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
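# Rates typically come next in this pattern (an assumption):
tpr = tp / (tp + fn)  # true positive rate
fpr = fp / (fp + tn)  # false positive rate
print(tpr, fpr)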