Exemplo n.º 1
0
def cross_validation(df, mapper):
    """Cross-validate a linear regression on the ``Sales`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``Sales`` column used as the target.
    mapper : transformer
        Feature-extraction step (e.g. a DataFrameMapper) run before the
        linear model.
    """
    pipe = sklearn.pipeline.Pipeline([
        ('featurized', mapper),
        ('lm', sklearn.linear_model.LinearRegression())
    ])
    # df.copy() keeps the mapper from mutating the caller's frame.
    result = cross_val_score(pipe, df.copy(), df.Sales)
    # Bug fix: the original was ``print(np.round(result), 2)`` -- it rounded
    # to 0 decimals and printed a stray literal 2. Round to 2 places instead.
    print(np.round(result, 2))
def test_with_car_dataframe(cars_dataframe):
    """Vectorized car descriptions should let a linear SVM beat 30% accuracy."""
    steps = [
        ("preprocess", DataFrameMapper([("description", CountVectorizer())])),
        ("classify", SVC(kernel="linear")),
    ]
    pipeline = Pipeline(steps)
    features = cars_dataframe.drop("model", axis=1)
    target = cars_dataframe["model"]
    cv_scores = cross_val_score(pipeline, features, target)
    assert cv_scores.mean() > 0.30
Exemplo n.º 3
0
def test_with_car_dataframe(cars_dataframe):
    """The description column, bag-of-words encoded, should give an SVM
    enough signal to exceed 30% mean cross-validated accuracy."""
    text_features = DataFrameMapper([("description", CountVectorizer())])
    model = Pipeline([("preprocess", text_features),
                      ("classify", SVC(kernel='linear'))])
    X = cars_dataframe.drop("model", axis=1)
    y = cars_dataframe["model"]
    result = cross_val_score(model, X, y)
    assert result.mean() > 0.30
Exemplo n.º 4
0
def test_with_iris_dataframe(iris_dataframe):
    """Identity-mapped iris measurements should let a linear SVM classify
    the species nearly perfectly with low variance across folds."""
    columns = [
        "petal length (cm)",
        "petal width (cm)",
        "sepal length (cm)",
        "sepal width (cm)",
    ]
    mapper = DataFrameMapper([(name, None) for name in columns])
    pipeline = Pipeline([("preprocess", mapper),
                         ("classify", SVC(kernel='linear'))])
    data = iris_dataframe.drop("species", axis=1)
    labels = iris_dataframe["species"]
    scores = cross_val_score(pipeline, data, labels)
    assert scores.mean() > 0.96
    assert scores.std() * 2 < 0.04
def test_with_iris_dataframe(iris_dataframe):
    """Passing the four iris features through unchanged should support a
    near-perfect, stable linear SVM."""
    feature_map = DataFrameMapper([
        ("petal length (cm)", None),
        ("petal width (cm)", None),
        ("sepal length (cm)", None),
        ("sepal width (cm)", None),
    ])
    clf = Pipeline([
        ("preprocess", feature_map),
        ("classify", SVC(kernel='linear')),
    ])
    X = iris_dataframe.drop("species", axis=1)
    y = iris_dataframe["species"]
    cv_results = cross_val_score(clf, X, y)
    assert cv_results.mean() > 0.96
    assert cv_results.std() * 2 < 0.04
Exemplo n.º 6
0
def logistic_001():
    """Score every feature individually with a univariate logistic regression.

    Returns
    -------
    list of (int, numpy.ndarray)
        One ``(column_index, cv_scores)`` pair per feature column.
        (Bug fix: the original discarded ``scores`` entirely.)
    """
    X, y = classes.get_train_data()
    y = y > 0  # binarize the target for classification

    remove_object = classes.RemoveObjectColumns()
    X = remove_object.fit_transform(X)

    imputer = Imputer()
    X = imputer.fit_transform(X)
    scores = []

    for i in range(X.shape[1]):
        clf = LogisticRegression()
        # Bug fixes: X[:, i] is 1-D but sklearn estimators require a 2-D
        # feature matrix, so slice to keep the column axis; 'roc' is not a
        # valid scorer name -- the AUC scorer is 'roc_auc'.
        s = cross_val_score(clf, X[:, i:i + 1], y, scoring='roc_auc')
        scores.append((i, s))
    return scores
Exemplo n.º 7
0
def cross_validation(df, mapper):
    """Print 2-decimal cross-validation scores of a linear model on Sales.

    ``df`` must carry a ``Sales`` column (the regression target); ``mapper``
    is the feature-extraction step placed ahead of the regressor. ``df`` is
    copied so the mapper cannot mutate the caller's frame.
    """
    pipe = sklearn.pipeline.Pipeline([
        ('featurized', mapper), ('lm', sklearn.linear_model.LinearRegression())
    ])
    result = cross_val_score(pipe, df.copy(), df.Sales)
    # Bug fix: ``print(np.round(result), 2)`` rounded to 0 decimals and
    # printed a stray 2; the intent was rounding to 2 decimal places.
    print(np.round(result, 2))
    ("one_hot_encoding", one_hot_encoding(categorical_features)),
    ("imputer", Imputer(axis=0, strategy='median')),
    ("random_forest", OneVsOneClassifier(RandomForestClassifier()))
])

kfold = KFold(n_splits=5, shuffle=True)
model = pipe_1.fit(x_train, y_train)
# model_file_path = '/Users/Aniket/Appzen/myenv/Source/semanticzen/learned_models/random_forest_baseline.pkl'
# joblib.dump(model, model_file_path)
# print '\n model : {0}'.format(model)
# print '\n Model is dumped to : {0}'.format(model_file_path)

scores = cross_val_score(
    model,  # steps to convert raw messages into models
    x_train,  # training data
    y_train,  # training labels
    cv=kfold,  # split data randomly into 10 parts: 9 for training, 1 for scoring
    scoring='accuracy',  # which scoring metric?
    n_jobs=-1,  # -1 = use all cores = faster
)

print '\n Train result : cross_validation'
print '\n Mean : {0}, std : (+/-) {1}'.format(scores.mean(), scores.std())

trained_model = model.steps[3][1]
print '\n trained_model : {0}'.format(trained_model)

y_prediction = model.predict(x_test)
report = classification_report(y_test, y_prediction)
print '\n ---------- Classification Report ------------'
print report
Exemplo n.º 9
0
def crossval():
    """Print 5-fold cross-validation summary statistics.

    Relies on the module-level globals ``pipe``, ``X_train`` and ``y_train``.
    """
    results = cross_val_score(pipe, X_train, y_train, cv=5)
    print("Cross Validation Scores are: ", results.round(3))
    print("Mean CrossVal score is: ", round(results.mean(), 3))
    print("Std Dev CrossVal score is: ", round(results.std(), 3))
Exemplo n.º 10
0
# NOTE(review): fragment -- ``table_y``, ``table``, ``mapper`` and the
# test_results_* variables are defined outside the visible span.
# ``.iteritems()`` is the pandas<0.21 / Python 2 spelling of ``.items()``.
for i, v in table_y.iteritems():
    print("\t" + i + " : " + repr(v))
# Class distribution of the soil-class target column.
table_y = table['SOILCLASS'].value_counts()
print("Dataset features a total of " + repr(len(table_y)) + " soil classes.")
for i, v in table_y.iteritems():
    print("\t" + i + " : " + repr(v))

print("Training and evaluating classifier through 10-fold cross-validation...")
# NOTE(review): the XGBClassifier below is constructed and then immediately
# overwritten by the RandomForestClassifier -- dead assignment, only the
# random forest is actually used.
classifier = XGBClassifier(n_estimators=100, n_jobs=5)
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=1000,
                                                     n_jobs=5)
pipe = sklearn.pipeline.Pipeline([('featurize', mapper),
                                  ('classify', classifier)])
# The custom scorer prints a per-fold classification report and returns the
# accuracy, so ``aux`` holds the 10 per-fold accuracies.
aux = cross_val_score(
    pipe,
    X=table,
    y=table.SOILCLASS,
    scoring=make_scorer(classification_report_with_accuracy_score),
    cv=10)
print("Overall results...")
print("Accuracy : " + repr(aux.mean()))
classification_report_with_accuracy_score(test_results_y_true,
                                          test_results_y_pred)

print("Training classification model on complete dataset...")
# Presumably mapper output column 0 is the target and columns 1: are the
# features -- TODO confirm against the mapper definition.
train_data = mapper.fit_transform(table)
classifier.fit(train_data[0:train_data.shape[0], 1:train_data.shape[1]],
               train_data[0:train_data.shape[0], 0])
joblib.dump(classifier, 'classification-model.joblib')

print("Infering the feature ranking within the classification model...")
# NOTE(review): the body of this ``if`` is truncated past the visible span.
if isinstance(classifier, XGBClassifier):
Exemplo n.º 11
0
def classifiers_comparison():
    """Compare several spam classifiers via ROC curves on the full mail set.

    For each (name, estimator, params) entry, wraps the estimator in an
    AntispamModel, cross-validates it with a custom ROC scorer, and plots the
    resulting curve twice: a full ROC view and a zoomed view of the
    high-TPR / low-FPR corner. Saves the figure to doc/charts/ROC_ALL.png.

    NOTE(review): uses pre-0.18 sklearn API (``StratifiedKFold(labels, 5)``)
    and indexes ``dict.values()[0]`` -- Python 2 only; on Python 3 that line
    would need ``list(...)[0]``.
    """
    classifiers = [
        # (display name, estimator, grid of pipeline params to set)
        ("Regresja logistyczna",
         LogisticRegression(),
         {'classifier__C': 5.0}),

        ("Naiwny klas. bayesowski",
         MultinomialNB(),
         {'classifier__alpha': 0.1}),

        ("SVM (liniowy)",
         SVC(kernel='linear', probability=True),
         {'classifier__C': 3.5,
          'features__text_words': 500, 'features__subject_words': 50}),

        ("SVM (RBF)",
         SVC(kernel='rbf', probability=True),
         {'classifier__C': 0.5, 'classifier__gamma': 0.1,
          'features__text_words': 500, 'features__subject_words': 50}),

        ("Las drzew losowych",
         RandomForestClassifier(),
         {'classifier__n_estimators': 100}),
    ]

    clf_count = len(classifiers)

    train_mails = parse_mails(COMPLETE_ALL['filename'])
    train_labels = COMPLETE_ALL['label']
    plt.figure(figsize=(8, 12))
    # One line style/color pair per classifier so curves stay distinguishable.
    for (clf_name, clf, params), (ls, lc) in zip(classifiers,
                                                 linestyles_gen()):
        model = AntispamModel(clf)
        model.spam_filter.set_params(**params)
        cv = StratifiedKFold(train_labels, 5)
        # The scorer accumulates interpolated ROC points across folds as a
        # side effect of cross_val_score; its return value is unused here.
        scorer = ROCScorer(params.keys())
        cross_val_score(model.spam_filter, train_mails, train_labels,
                        cv=cv, scoring=scorer, verbose=2)
        score = scorer.interp_scores.values()[0]
        label = clf_name
        # Draw the same curve on both subplots (full and zoomed views).
        plt.subplot(2, 1, 1)
        score.plot(label=label, lc=lc, ls=ls, fill_alpha=0.5 / clf_count)
        plt.subplot(2, 1, 2)
        score.plot(label=label, lc=lc, ls=ls, fill_alpha=0.5 / clf_count)
    # Top subplot: full ROC, with a dashed box marking the zoom region.
    plt.subplot(2, 1, 1)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='lower right', fontsize='medium')
    plt.gca().add_patch(
        plt.Rectangle((0, 0.8), 0.2, 0.2, ls='dashed', fc='none')
    )
    plt.xlim(-0.05, 1)
    plt.ylim(0, 1.05)
    # Bottom subplot: zoom into FPR <= 0.2, TPR >= 0.8.
    plt.subplot(2, 1, 2)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)
    plt.savefig('doc/charts/ROC_ALL.png')
    plt.show()
Exemplo n.º 12
0
 def cv_score(self, train_data, labels):
     """Cross-validate ``self.spam_filter`` on the given data, scored by F1.

     NOTE(review): ``score_func=`` was removed from cross_val_score in
     sklearn 0.18; the modern equivalent is ``scoring='f1'`` (or
     ``scoring=make_scorer(f1_score)``). Left as-is for the old API.
     """
     return cross_val_score(self.spam_filter, train_data, labels,
                            score_func=f1_score)
Exemplo n.º 13
0
# Baseline: plain linear regression on log1p-transformed targets.
# NOTE(review): ``normalize=True`` was deprecated and later removed from
# sklearn's LinearRegression; newer versions need an explicit scaler.
model = LinearRegression(normalize=True)

model.fit(X_train,np.log1p(y_train))
# Invert the log1p transform; np.exp(x) - 1 is equivalent to np.expm1(x).
pred = np.exp(model.predict(X_test))-1

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test,pred))

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge
# Ridge with grid-searched alpha; nested CV (outer cv=5, inner cv=3) gives
# an unbiased RMSE estimate on the log scale.
pipe_ridge = make_pipeline(preprocessing_features, Ridge())
param_grid = {'ridge__alpha' : [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
pipe_ridge_gs = GridSearchCV(pipe_ridge, param_grid=param_grid, scoring = 'neg_mean_squared_error', cv=3)
result = np.sqrt(-cross_val_score(pipe_ridge_gs, X_train, np.log1p(y_train), scoring = 'neg_mean_squared_error', cv = 5))
np.mean(result)

# Refit the grid search on the full training set and evaluate on the test split.
pipe_ridge_gs.fit(X_train, np.log1p(y_train))
predicted = np.exp(pipe_ridge_gs.predict(X_test)) -1

# Targets appear to be counts, hence the rounding -- TODO confirm.
predicted= predicted.round()
print(mean_absolute_error(y_test,predicted))

# Score the held-out competition file and write the submission.
df_TEST = pd.read_csv(path+file_test)
df_TEST.week_start_date = pd.to_datetime(df_TEST.week_start_date, yearfirst=True)

predicted_TEST = np.exp(pipe_ridge_gs.predict(df_TEST)) -1

pd.DataFrame(predicted_TEST).to_csv(path+'TEST.csv')
Exemplo n.º 14
0
# NOTE(review): fragment -- the opening of the ``nn`` pipeline (an MLP by the
# look of these keyword arguments) was cut off above; these lines close its
# constructor call. Original comments translated from Chinese.
                                         alpha=0.0001,
                                         learning_rate='adaptive',
                                         learning_rate_init=0.001,
                                         max_iter=1000))])
 # Fit the model.
 nn.fit(data, label)
 # Predict on the test split.
 nn_predict = nn.predict(X_test)
 # Model evaluation.
 # Basic accuracy score.
 nn_score = nn.score(X_test, y_test)
 print(nn_score)
 # Cross-validation on both the train and the test split (10 folds each).
 nn_cross1 = cross_val_score(nn,
                             X_train,
                             y_train,
                             scoring='accuracy',
                             cv=10,
                             n_jobs=-1)
 nn_cross2 = cross_val_score(nn,
                             X_test,
                             y_test,
                             scoring='accuracy',
                             cv=10,
                             n_jobs=-1)
 print(nn_cross1)
 print(nn_cross2)
 # #     scores1.append(nn_cross1.mean())
 # #     scores2.append(nn_cross2.mean())
 # #     print(nn_cross1.mean())
 # #     print(nn_cross2.mean())
 # # plt.plot(scores1, linestyle='-', color='r', label='train')
    ('UHDICM30', None), ('UHDICM40', None), ('LHDICM00', None),
    ('LHDICM10', None), ('LHDICM20', None), ('LHDICM30', None),
    ('LHDICM40', None), ('CRFVOL00', None), ('CRFVOL10', None),
    ('CRFVOL20', None), ('CRFVOL30', None), ('CRFVOL40', None),
    ('SNDPPT00', None), ('SNDPPT10', None), ('SNDPPT20', None),
    ('SNDPPT30', None), ('SNDPPT40', None), ('SLTPPT00', None),
    ('SLTPPT10', None), ('SLTPPT20', None), ('SLTPPT30', None),
    ('SLTPPT40', None), ('CLYPPT00', None), ('CLYPPT10', None),
    ('CLYPPT20', None), ('CLYPPT30', None),
    ('CLYPPT40', None), ('BLD00', None), ('BLD10', None), ('BLD20', None),
    ('BLD30', None), ('BLD40', None), ('PHIHOX00', None), ('PHIHOX10', None),
    ('PHIHOX20', None), ('PHIHOX30', None), ('PHIHOX40', None),
    ('PHIKCL00', None), ('PHIKCL10', None), ('PHIKCL20', None),
    ('PHIKCL30', None), ('PHIKCL40', None), ('ORCDRC00', None),
    ('ORCDRC10', None), ('ORCDRC20', None), ('ORCDRC30', None),
    ('ORCDRC40', None), ('CECSUM00', None), ('CECSUM10', None),
    ('CECSUM20', None), ('CECSUM30', None), ('CECSUM40', None)
])
table_y = table_y['WRB_2006_NAMEf_2'].value_counts()
print("Dataset features a total of " + repr(len(table_y)) + " soil classes.")
print("Training and evaluating classifier through 10-fold cross-validation...")
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
#classifier = GCForest(get_gcforest_config())
pipe = sklearn.pipeline.Pipeline([('featurize', mapper),
                                  ('classify', classifier)])
cross_val_score(pipe,
                X=table,
                y=table.WRB_2006_NAMEf_2,
                scoring=make_scorer(classification_report_with_accuracy_score),
                cv=10)