Example #1
def score_model(dataset):
    train, test, targets = recover_train_test_target(dataset)

    randomForestClassifier = RandomForestClassifier(n_estimators=50, max_features='sqrt')
    randomForestClassifier = randomForestClassifier.fit(train, targets)

    DataExploration.show_variable_relation_with_survival(train, randomForestClassifier)

    model = SelectFromModel(randomForestClassifier, prefit=True)
    train_reduced = model.transform(train)
    test_reduced = model.transform(test)
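    # note: these reduced matrices are not used below; the final model is fit
    # and scored on the full feature set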

    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

    print('Score: ', compute_score(model, train, targets, scoring='accuracy'))

    output = model.predict(test).astype(int)
    df_output = pd.DataFrame()
    aux = pd.read_csv('data/test.csv')
    df_output['PassengerId'] = aux['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('data/solution.csv', index=False)
class randForestClassifier(AbstractModelClass):

    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        self.create_model()

    def create_model(self):
        # keyword argument: recent scikit-learn makes estimator params keyword-only
        self.classifier = SelectFromModel(RandomForestClassifier(n_estimators=self.n_estimators))

    def fit(self, data, labels):
        self.classifier.fit(data, labels)

    def classifier_prediction(self, x_test):
        # SelectFromModel has no predict(); delegate to the estimator it fitted
        return self.classifier.estimator_.predict(x_test)

    def get_support(self):
        return self.classifier.get_support()

    def get_SelectedFeatures(self, feature_data):
        return feature_data.columns[self.get_support()]

    def get_FeatureImportance(self):
        feature_ranked = self.classifier.estimator_.feature_importances_
        ranked_feature_indices = np.argsort(feature_ranked)[::-1]
        return ranked_feature_indices, feature_ranked
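
# Usage sketch (hypothetical, not from the original source; assumes
# AbstractModelClass and the sklearn/numpy/pandas imports used above are in scope):
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
frame = pd.DataFrame(X, columns=['f%d' % i for i in range(8)])

wrapper = randForestClassifier(n_estimators=50)
wrapper.fit(frame, y)
print(wrapper.get_SelectedFeatures(frame))   # columns kept by SelectFromModel
indices, scores = wrapper.get_FeatureImportance()
print(indices[:3])                           # top-3 feature indices by importance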
Example #3
def randomForest(train, test, targets):
    clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
    clf = clf.fit(train, targets)
    model = SelectFromModel(clf, prefit=True)  # note: reassigned below; never used to transform the data

    run_gs = False
    if run_gs:
        parameter_grid = {
            'max_depth': [4, 6, 8],
            'n_estimators': [50, 10],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split': [2, 3, 10],
            'min_samples_leaf': [1, 3, 10],
            'bootstrap': [True, False],
        }
        forest = RandomForestClassifier()
        cross_validation = StratifiedKFold(n_splits=5)  # modern API: folds receive X/y at split time

        grid_search = GridSearchCV(forest,
                                   scoring='accuracy',
                                   param_grid=parameter_grid,
                                   cv=cross_validation)

        grid_search.fit(train, targets)
        model = grid_search
        parameters = grid_search.best_params_

        print('Best score: {}'.format(grid_search.best_score_))
        print('Best parameters: {}'.format(grid_search.best_params_))
    else:
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6
        }

        model = RandomForestClassifier(**parameters)
        model.fit(train, targets)

    output = model.predict(test).astype(int)
    df_output = pd.DataFrame()
    aux = pd.read_csv('../input/test.csv')
    df_output['PassengerId'] = aux['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../prediction.csv',
                                                  index=False)
Example #4
        y_test_pred = []
        y_pred = []
        for col in select_models:
            test_preds = []
            preds = []
            for model in models[col]:
                test_preds.append(model.predict(x_test))
                preds.append(model.predict(x_pred))
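            # average each base model's predictions (simple ensemble mean)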
            test_pred = np.mean(test_preds, axis=0)
            pred = np.mean(preds, axis=0)

            y_test_pred.append(test_pred)
            y_pred.append(pred)
        selection_model.fit(select_x_train, y_train[:, i])

        # note: this reassignment clobbers the y_pred list built above
        y_pred = selection_model.predict(select_x_test)
        r2 = r2_score(y_test[:, i], y_pred)
        mae = MAE(y_test[:, i], y_pred)
        print(selection_model.best_params_)
        if mae <= best_mae:
            print("예아~")
            best_mae = mae
            best_model = selection_model
            best_y_pred = selection_model.predict(select_x_pred)
            best_y_test_pred = y_pred
        print("Thresh=%.3f, n=%d, MAE: %.5f R2: %.2f%%" %
              (thresh, select_x_train.shape[1], mae, r2 * 100))
    final_y_pred.append(best_y_pred)
    final_y_test_pred.append(best_y_test_pred)

y_test_pred = []
Example #5
plt.show()

# In[ ]:

start = time.perf_counter()  # time.clock() was removed in Python 3.8
model = XGBClassifier(booster='gbtree',
                      max_depth=5,
                      eval_metric='auc',
                      learning_rate=0.7,
                      min_child_weight=0.9,
                      verbose_eval=True)
model.fit(DataFrame(X_train, dtype='float'), DataFrame(y_train))
end = time.perf_counter()
print('Training time: ' + str(end - start))
start = time.perf_counter()
y_pred = model.predict(DataFrame(X_test, dtype='float'))
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, predictions))
end = time.perf_counter()
print('Time to predict 163 results: ' + str(end - start))

# In[ ]:

print(model)

# In[ ]:

xgb.to_graphviz(model, num_trees=10)
class Predictor:
    def __init__(self):
        self._regressor = None
        self._model = None
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None

        data = self._read_data()
        self._create_features(data)
        self._create_regressor()
        self._train_model()

    def simplify(self, test_data):
        test_data = test_data.loc[:, test_data.columns.isin(self._filtered_features)]
        test_data = test_data.reindex(self._filtered_features, axis=1)
        self._X_test = self._X_test.reindex(self._filtered_features, axis=1)
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6
        }
        self._model = RandomForestRegressor(**parameters)
        self._model.fit(self._X_train, self._y_train)
        return test_data

    def predict(self, test_data, output_datatype):
        return self._model.predict(test_data).astype(output_datatype)

    def _read_data(self):
        if path.exists(DATASET_NAME):
            data = pd.read_pickle(DATASET_NAME)
            return differential_vector(data)
        frames = [pd.read_pickle(match) for match in \
                  glob('matches/*/*')]
        data = pd.concat(frames)
        data.drop_duplicates(inplace=True)
        data = filter_stats(data)
        data = data.dropna()
        data['home_free_throw_percentage'].fillna(0, inplace=True)
        data['away_free_throw_percentage'].fillna(0, inplace=True)
        data['points_difference'] = data['home_points'] - data['away_points']
        return differential_vector(data)

    def _create_features(self, data):
        X = data.drop('points_difference', axis=1)
        y = data['points_difference']
        split_data = train_test_split(X, y)
        self._X_train, self._X_test, self._y_train, self._y_test = split_data

    def _create_regressor(self):
        reg = RandomForestRegressor(n_estimators=50, max_features='sqrt')
        self._regressor = reg.fit(self._X_train, self._y_train)

    def _train_model(self):
        train = self._X_train
        self._model = SelectFromModel(self._regressor,
                                      prefit=True,
                                      threshold=0.01)
        self._X_train = self._model.transform(self._X_train)
        new_columns = train.columns[self._model.get_support()]
        self._filtered_features = [str(col) for col in new_columns]
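
# Usage sketch (hypothetical): Predictor trains itself on construction; simplify()
# then narrows a new frame to the selected columns before predict() is called.
# predictor = Predictor()
# upcoming = predictor.simplify(upcoming_games_df)   # upcoming_games_df is assumed
# print(predictor.predict(upcoming, int))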
Example #7

#Tree-based feature selection
#################   C5 - Random Forest
from sklearn.feature_selection import SelectFromModel
#by default, SelectFromModel keeps the features whose importance is greater than
#the mean importance across all features
rf_sel = RandomForestClassifier(n_estimators=20).fit(x_train[selected_feat], y_train)
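
# Hypothetical sketch of what the SelectFromModel import above enables: keep only
# the features whose importance exceeds the mean importance (the default threshold).
sfm = SelectFromModel(rf_sel, prefit=True)
x_train_selected = sfm.transform(x_train[selected_feat])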

#save model
joblib.dump(rf_sel, 'q2c_rf_fea_sel.pkl')

#load the saved model back
rf_sel = joblib.load('q2c_rf_fea_sel.pkl')

# 4 - create the prediction and convert it into a dataframe
df_pred = pd.DataFrame(rf_sel.predict(x_test[selected_feat]), columns=['prediction'])

# 5 - merge the prediction back into the test dataset
final_df = pd.merge(adult_test, df_pred, how='left', left_index=True, right_index=True)

##Export data as CSV
final_df.to_csv('q2c_rf_fea_sel.csv', index=False)



#### Q2d - ensemble method
## voting

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
Example #8
for alg in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(train, targets)
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(test.astype(float))[:,1]
    full_predictions.append(predictions)
predictions = (full_predictions[0] + full_predictions[1]*2 + full_predictions[2]) / 4
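# The blend above is a 1:2:1 weighted mean; an equivalent explicit form (sketch,
# assuming numpy is imported as np and full_predictions holds three probability arrays):
# predictions = np.average(full_predictions, axis=0, weights=[1, 2, 1])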
models = [logreg_cv, rf, gboost]
for model in models:
    print('Cross-validation of : {0}'.format(model.__class__))
    score = compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy')
    print('CV score = {0}'.format(score))
    print('****')
rf.fit(train, targets)
"""
predictions = model.predict(test[predictors])
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
predictions = predictions.astype(int)
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions
})
print(submission)
submission.to_csv('/Users/martin_yan/Desktop/submission3.csv', index=False)
"""
# Features used
# 1. Linear regression
# alg = LinearRegression()
# 2. Logistic regression
# alg = LogisticRegression(random_state=1)
Example #9
def predictWithFeatureSelectionNNConst(X, y, topN, size, learning_rate,
                                       n_iter):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=size,
                                                        random_state=0,
                                                        shuffle=True)
    clf = ExtraTreesClassifier(n_estimators=100)
    clf = clf.fit(X_train, y_train)

    print('Feature Importances')
    print(clf.feature_importances_)
    for feature in zip(X.columns, clf.feature_importances_):
        print(feature)
    # note: despite the name, this is the std of each feature's importance across trees
    feature_importance_normalized = np.std(
        [tree.feature_importances_ for tree in clf.estimators_], axis=0)

    XFeatures = list()

    model = SelectFromModel(clf,
                            prefit=True,
                            threshold=-np.inf,
                            max_features=topN)
    X_train = model.transform(X_train)
    X_test = model.transform(X_test)

    print(model.get_support(indices=True))
    for feature_list_index in model.get_support(indices=True):
        XFeatures.append(X.columns[feature_list_index])

    print('Selected Features')
    print(XFeatures)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)  # transform only: do not refit the scaler on test data

    model = MLPClassifier(hidden_layer_sizes=(100, ),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=learning_rate,
                          max_iter=n_iter,
                          shuffle=True,
                          random_state=None,
                          verbose=True,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10)

    model.fit(X_train, y_train)
    joblib.dump(model, 'dataset/mlp_class.jbl')
    stat = list()
    # Evaluate on training data
    print('\n-- Training data --')
    predictions = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(metrics.classification_report(y_train, predictions))
    print('Confusion Matrix:')
    print(metrics.confusion_matrix(y_train, predictions))
    print('')
    stat.append(round(accuracy * 100.0, 2))
    # Evaluate on test data
    print('\n---- Test data ----')
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(metrics.classification_report(y_test, predictions))
    print('Confusion Matrix:')
    print(metrics.confusion_matrix(y_test, predictions))
    stat.append(round(accuracy * 100.0, 2))
    stat.append(learning_rate)

    plt.plot(model.loss_curve_)
    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.title('Model loss for relu activation and solver adam')
    plt.show()
    return stat
Example #10
modeler = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]

S_X_train, S_X_test = stacking(modeler,
                               X_train,
                               y_train,
                               X_test,
                               regression=False,
                               metric=metrics.log_loss,
                               needs_proba=True,
                               stratified=True,
                               shuffle=True,
                               random_state=42,
                               verbose=2)

# %%
model = LogisticRegression(penalty='l1', C=1, random_state=42)

model = model.fit(S_X_train, y_train)

y_pred = pd.Series(model.predict(S_X_test))
y_pred_proba = model.predict_proba(S_X_test)[:, 1]

print("R Square:", metrics.accuracy_score(y_test, model.predict(S_X_test)))
print("kappa:", metrics.cohen_kappa_score(y_test, model.predict(S_X_test)))

# %%
Example #11
            if type_col[c] == 1:
                print("Categorical var %s selected \n" % ki)
                p += 1
            break
        c += 1

print("attributes cat--", p)
print(list_already_taken)



X_train, X_test, y_train, y_test = train_test_split(X_new_df, Y, test_size=0.3, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


regr = linear_model.LinearRegression()
model = regr.fit(X_train, y_train)

Y_res = model.predict(X_test)

print("Mean squared error: %.2f"% mean_squared_error(y_test, Y_res))
print('Variance score: %.2f' % r2_score(y_test, Y_res))


print "END"

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y_b[train_index], y_b[test_index]

        start = time.time()

        lasso = Lasso(alpha=alpha[idx]).fit(x_train, y_train)
        model = SelectFromModel(lasso, prefit=True)
        x_train = model.transform(x_train)
        x_test = model.transform(x_test)

        #create NN model
        model = build_model()
        model.fit(x_train, y_train, epochs=10, batch_size=100, verbose=1)

        prediction = model.predict(x_test)

        end = time.time()
        save_data[f"{classifier_condition}_fold_{z+1}_n={alpha[idx]}"] = (model_evaluation(f"{classifier_condition}, {alpha[idx]}", f"fold_{z+1}", x_test, y_test, prediction, model, end-start, n_classes))

        z+=1

    col_start = (idx * 10) + (idx + 1)  # renamed to avoid shadowing the built-in range
    save_data[f"Average {classifier_condition}, n = {alpha[idx]}"] = save_data.iloc[:, col_start:].mean(axis=1)

save_data.to_csv(f"{classifier_condition}_new.csv")

#-------------------------------------------
# RUN CLASSIFIER WITH ISOMAP IMPLEMENTATION
#-------------------------------------------
'''ISOMAP is so slow that the value of n_components is adjusted manually;
to find the best model, we try three different ones.'''
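
# Hypothetical sketch of the Isomap step described above; it would produce the
# train_x_reduced used by the commented-out cross-validation loop below
# (n_components is set manually, per the note):
# from sklearn.manifold import Isomap
# iso = Isomap(n_components=10)
# train_x_reduced = iso.fit_transform(train_x)
# test_x_reduced = iso.transform(test_x)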
svml = SVC()
gboost = GradientBoostingClassifier()
rf = RandomForestClassifier(n_estimators=100)
logreg = LogisticRegressionCV()
gaus = GaussianNB()
knear = KNeighborsClassifier()


models = [logreg, svml, rf, gboost,knear,gaus]

#for model in models:
#    print("Cross-validating: {0}".format(model.__class__))
#    score = compute_score(clf=model,x=train_x_reduced, y=train_y)
#    print("Accuracy of model: {0}".format(score))
#    print("*************")
    
model = GradientBoostingClassifier()
model.fit(train_x,train_y)
output = model.predict(test_x).astype(int) # so we don't get floats

passIDs = pd.read_csv("test_titanic.csv")
results = pd.DataFrame()
results["PassengerId"] = passIDs["PassengerId"]
results["Survived"] = output
print(results.shape)
results.to_csv("titanicsubmission.csv",index=False)


Example #14
X_train, X_test, y_train, y_test = train_test_split(train_reduced, targets, test_size=0.2, random_state=0)

parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
              'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

model = RandomForestClassifier(**parameters)
model.fit(X_train, y_train)

model2 = GradientBoostingClassifier()
model2.fit(X_train, y_train)

model3 = SVC()
model3.fit(X_train, y_train)
compute_score(model, X_test, y_test, scoring='accuracy')

y = model.predict(test_reduced)
testset = pd.read_csv('test.csv')
my_submission = pd.DataFrame({'PassengerId': testset.PassengerId, 'Survived': y})
my_submission.to_csv('pred2.csv', index=False)

#To test different hyperparameter combinations of RandomForestClassifier
'''run_gs = True

if run_gs:
    parameter_grid = {
                 'max_depth' : [4, 6, 8],
                 'n_estimators': [50, 10],
                 'max_features': ['sqrt', 'auto', 'log2'],
                 'min_samples_split': [1.0, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],
Example #15
#ans = bdt_discrete.predict(final_model_test_data);
ans = clf_2.predict(final_model_test_data)

#ans = clf_3.predict(final_model_test_data);
#ans = clf_4.predict(final_model_test_data);

for mn in range(len(ans)):
    if (ans[mn] == 0):
        classify = 0
    else:
        classify = 1
##############################1ST######################################################
    if (classify == 0):
        q = np.matrix([final_model_test_data[mn]])
        predictions = model.predict(q)
        r.append(predictions)
#######################################################################################
    elif classify == 1:
        q = np.matrix([final_model_test_data[mn]])
        predictions = model_2.predict(q)
        r.append(predictions)
#######################################################################################

predicted_results = r

for t in range(len(r)):
    if (r[t] - int(r[t])) > 0.5:
        predicted_results[t] = math.ceil(r[t])
    else:
        predicted_results[t] = int(r[t])
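
# Vectorized equivalent of the rounding loop above (sketch; assumes the
# predictions are non-negative scalars):
# arr = np.asarray(r, dtype=float).ravel()
# frac = arr - np.floor(arr)
# predicted_results = np.where(frac > 0.5, np.ceil(arr), np.floor(arr)).astype(int)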
Example #16
                          min_child_samples=6,
                          min_child_weight=0,
                          subsample=0.8,
                          colsample_bytree=0.7,
                          reg_alpha=0,
                          importance_type="split")

for train_ix, test_ix in cv.split(X_train):

    X_cvtrain, X_cvtest = X_train.iloc[train_ix, :], X_train.iloc[test_ix, :]
    y_cvtrain, y_cvtest = y_train["y"].iloc[train_ix], y_train["y"].iloc[
        test_ix]

    model.fit(X_cvtrain, y_cvtrain)

    predtrain = model.predict(X_cvtrain)
    pred = model.predict(X_cvtest)

    print("\nTrain R2:")
    print(np.round(r2_score(y_cvtrain, predtrain), 2))
    print("\nTest R2:")
    print(np.round(r2_score(y_cvtest, pred), 2))
    print("\n________________________")

    R2.append(np.round(r2_score(y_cvtest, pred), 4))

print("\nAverage R2:", round(np.sum(R2) / 5, 2))
print("Std:", round(np.std(R2), 4))

# Predict Test Data
Example #17
imp_feat.nlargest(20).plot(kind='barh')
ind = np.argsort(imp)
rf.feature_importances_

# Stacking model
xcl_train, xcl_test, ycl_train, ycl_test = train_test_split(x_cl,
                                                            y_cl,
                                                            test_size=0.3)
#Support vector classifier
#svm=SVC(C=5, probability=True,gamma='auto')
#svm.fit(xcl_train,ycl_train)
#
#
lr = LogisticRegressionCV(cv=10)
lr.fit(x_cl, y_cl)
metrics.accuracy_score(y_cl, lr.predict(x_cl))
metrics.roc_auc_score(y_cl, lr.predict_proba(x_cl)[:, 1])  # use probabilities for AUC
#nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(xcl_train, ycl_train)
metrics.accuracy_score(ycl_test, gb.predict(xcl_test))
#rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)

#Predicting Monthly Revenue
reg_df = temp
cl_df['Churn']
reg_df['Revenue_loss'] = cl_df.Churn * reg_df['MonthlyRevenue']
reg_df['Revenue_loss']
reg_df = reg_df.drop(['MonthlyRevenue'], axis=1)
#Casting categorical types
for n, v in reg_df.items():  # iteritems() was removed in pandas 2.0
Example #18
random_forest = GridSearchCV(RandomForestClassifier(class_weight="balanced",
                                                    random_state=123),
                             rf_param_grid,
                             cv=kfold,
                             n_jobs=-1,
                             refit=True,
                             scoring="roc_auc")

random_forest.fit(X_train_rf_selected, y_train)

print(
    f'Best score: {random_forest.best_score_} with param: {random_forest.best_params_}'
)

X_test_rf_selected = X_test[X_test.columns.intersection(rf_selected_features)]
y_rf_predictions = random_forest.predict(X_test_rf_selected)

conf_matrix = metrics.confusion_matrix(y_test, y_rf_predictions)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, fmt='g', cmap='coolwarm_r')
plt.title('Random Forests')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#plt.savefig('RF_CM.png', quality=95)
plt.show()

print(f"Accuracy: {metrics.accuracy_score(y_test, y_rf_predictions)}")
print(classification_report(y_test, y_rf_predictions))

y_pred_prob_rf = random_forest.predict_proba(X_test_rf_selected)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob_rf)
auc = metrics.roc_auc_score(y_test, y_pred_prob_rf)
Example #19
def main():


    data = pd.read_csv('selfie_dataset.txt', 
                        sep=" ", 
                        header=None, 
                        names=["Nome","Rate", "partial_faces", "is_female", "baby", "child","teenager", "youth", "middle_age","senior", "white", "black","asian", "oval_face", "round_face",
                                "heart_face", "smiling", "mouth_open","frowning", "wearing_glasses", "wearing_sunglasses","wearing_lipstick","2tongue_out0", "duck_face","black_hair",
                                 "blond_hair", "brown_hair","red_hair", "curly_hair", "straight_hair","braid_hair", "showing_cellphone", "using_earphone","using_mirror", "wearing_hat"
                                 ,"braces","harsh_lighting","dim_lighting"])

                                  
    

    
    labels = np.array(data['Rate'])
    features1 = data.drop("Rate", axis=1)
    features = features1.drop("Nome", axis=1)

    feature_list = list(features.columns)
    features = np.array(features)


    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.1, random_state=0)

    sc = StandardScaler()
    train_features = sc.fit_transform(train_features)
    test_features = sc.transform(test_features)

    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)


    isTrained = False
    min_importance = 0.04
    n_estimators = 200
    retrain = True

    if isTrained:

        if retrain:
            crf = joblib.load("regressor.pkl")

            rf = SelectFromModel(crf, threshold=min_importance)
            rf.fit(train_features, train_labels)

            train_features = rf.transform(train_features)
            test_features = rf.transform(test_features)

            print('The shape of our important_train_features is:', train_features.shape)
            print('The shape of our important_test_features is:', test_features.shape)

            rf_important = RandomForestRegressor(n_estimators=n_estimators,random_state=1)


            rf_important.fit(train_features, train_labels)

            rf = rf_important

            print(rf_important)
            print("\n\n")
            predictions = rf_important.predict(test_features)
            importances = list(rf_important.feature_importances_)
            
        else:
            rf = joblib.load("regressor.pkl")
            print(rf)
            print("\n\n")
            predictions = rf.predict(test_features)
            importances = list(rf.feature_importances_)
    
    else:

        rf = RandomForestRegressor(n_estimators=n_estimators, oob_score=True, random_state=2)
        rf.fit(train_features, train_labels)
        joblib.dump(rf, 'regressor.pkl') 

        print(rf)
        print("\n\n")
        predictions = rf.predict(test_features)
        importances = list(rf.feature_importances_)

    
    
    print('Mean Absolute Error:', mean_absolute_error(test_labels,predictions))
    mape = np.mean(np.abs((test_labels - predictions) / test_labels)) * 100
    accuracy = 100 - mape
    print('Accuracy:', round(accuracy, 2), '%')

    print('Variance Score: ', explained_variance_score(test_labels,predictions))

    print("\n\n")

    print("Importances: ")
    importances = list(rf.feature_importances_)
    feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(feature_list, importances)]
    feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))


    print()
Example #20
#print(test_reduced.shape)

parameters = {
    'bootstrap': False,
    'min_samples_leaf': 3,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 6
}

model = RandomForestClassifier(**parameters)
model.fit(training_df_processed, survived_column)

print(
    compute_score(model,
                  training_df_processed,
                  survived_column,
                  scoring='accuracy'))

output = model.predict(test_df_processed).astype(int)

df_output = pandas.DataFrame()
aux = pandas.read_csv('D:/ML work/Titanic Data/test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId',
           'Survived']].to_csv('D:/ML work/Titanic Data/output.csv',
                               index=False)
Example #21
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 4,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 5
}

model = RandomForestClassifier(**parameters)
model.fit(X_train_reduced, Y_train)

# In[133]:

output = model.predict(test1_reduced).astype(int)
model1 = round(model.score(X_train_reduced, Y_train) * 100, 2)
model1

# #### Applying Random Forest Classifier. One can play with the parameters (hyperparameter tuning) to increase the score. I achieved .803 with less feature engineering; however, as I increased the number of dummies for age, it came down to 78.9.

# In[131]:

output = model.predict(test1_reduced).astype(int)
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": output
})
submission.to_csv("titanic51_submission.csv", index=False)

#  I'll update as I improve. Your guidance is appreciated. Also, thanks a lot for all the tutorials, from which I learned a lot.
## Evaluation (No tuning) ------------> (1)
prediction = rf_classifier.predict(X_test)
print('Confusion Matrix\n', confusion_matrix(y_test, prediction))
print('Accuracy Score: ', accuracy_score(y_test, prediction))
print('Classification Report:\n', classification_report(y_test, prediction))

### Manual Hyperparameter Tuning
model = RandomForestClassifier(n_estimators=300,
                               criterion='entropy',
                               max_features='sqrt',
                               min_samples_leaf=10,
                               random_state=100).fit(X_train, y_train)

## Evaluation (Manual tuning) ------------> (2)
predictions = model.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

### Randomized Search Cv
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['sqrt', 'log2']  # 'auto' was removed in recent scikit-learn versions
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]
min_samples_split = [2, 5, 10, 14]
min_samples_leaf = [1, 2, 4, 6, 8]
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
Example #23
else:
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6
    }

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

# In[138]:

Y_pred = model.predict(test).astype(int)

# ## Model, predict and solve

# In[35]:

X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

# In[36]:

# Logistic Regression

logreg = LogisticRegression()

# The head of the following grid-search call is missing from the source; this is
# a plausible reconstruction following the pattern used in Example #3.
grid_search = GridSearchCV(forest,
                           scoring='accuracy',
                           param_grid=parameter_grid,
                           cv=cross_validation,
                           verbose=1)
grid_search = grid_search.fit(train_reduced, targets)
params = grid_search.best_params_

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# ###### Now we use the best params as hyperparameters to train our final machine-learning model

# In[ ]:

model = RandomForestClassifier(**params)

model.fit(train_reduced, targets)
print(compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy'))

# ### So we obtain 82+% classification accuracy with our machine-learning model. We will submit the prediction to see how well we do on the test data set, using the Random Forest predictions y_pred.

# In[ ]:

y_pred = model.predict(test_reduced).astype(int)

# In[ ]:

test_data = pd.read_csv('../input/test.csv')
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": y_pred
})
submission.to_csv('titanic.csv', index=False)
Example #25
            coef0=0.0,
            decision_function_shape=None,
            degree=3,
            gamma=1e-05,
            kernel='rbf',
            max_iter=-1,
            probability=True,
            random_state=None,
            shrinking=True,
            tol=0.001,
            verbose=False)  # max_iter=-1 means no iteration limit: train as long as needed

model.fit(X_train, y_train)
print "Eğitim bitti"
'''eğitim uzun sürüyor cache size vb parametreler sürede etkili'''
''' onceden yarattıgın pickle dosyasını cagırmak için;

import joblib

model = joblib.load('yeni_model.pkl')

model.predict(X_test[150,:])
'''

import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package
joblib.dump(
    best_model, 'malware_model.pkl'
)  # pickling saves disk space and makes reloading easy

import pickle
Example #26
                          name,
                          classes=sorted(list(set(ground_truth))),
                          normalize=True,
                          title='Normalized confusion matrix')

# In[7]:

#----- Feature importance ranking -----#

for name, model in [('RandomForest_recursive', RandomForestClassifier())]:
    print('Performing recursive feature elimination: ', name)

    selector = RFECV(model, step=20, cv=10)
    selector = selector.fit(data_train, ground_truth)

    y_pred = selector.predict(data_train)
    cm = confusion_matrix(ground_truth,
                          y_pred,
                          labels=sorted(list(set(ground_truth))))

    #----- plot, print and save train results -----#
    plot_recall(name, cm, train_directory)
    plot_confusion_matrix(cm,
                          train_directory,
                          name,
                          classes=sorted(list(set(ground_truth))),
                          normalize=True,
                          title='Normalized confusion matrix')

    #----- save the model to disk -----#
    filename = os.path.join(train_directory, name) + '_model.sav'
Example #27
    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

output = model.predict(test).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId', 'Survived']].to_csv('gridsearch_rf.csv', index=False)

trained_models = []
for model in models:
    model.fit(train, targets)
    trained_models.append(model)

predictions = []
for model in trained_models:
    predictions.append(model.predict_proba(test)[:, 1])
Example #28
class Predictor:
    def __init__(self, data_directory='matches'):
        self._regressor = None
        self._model = None
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None

        data = self._read_data(data_directory)
        self._create_features(data)
        self._create_regressor()
        self._train_model()

    @property
    def accuracy(self):
        predicted = self.predict(self._X_test, int)
        accuracy = round(accuracy_score(self._y_test, predicted) * 100.0, 2)
        print('Accuracy: %s%%' % accuracy)
        return accuracy

    def print_tree(self):
        dot_data = StringIO()
        i = 1
        for tree_in_forest in self._model.estimators_:
            dot_data = tree.export_graphviz(
                tree_in_forest,
                out_file='tree_%s.dot' % str(i),
                feature_names=self._filtered_features,
                class_names=['away_win', 'home_win'],
                filled=True,
                rounded=True,
                special_characters=True)
            i += 1

    def simplify(self, test_data):
        test_data = test_data.loc[:, test_data.columns.isin(self._filtered_features)]
        test_data = test_data.reindex(self._filtered_features, axis=1)
        self._X_test = self._X_test.reindex(self._filtered_features, axis=1)
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6
        }
        self._model = RandomForestRegressor(**parameters)
        self._model.fit(self._X_train, self._y_train)
        return test_data

    def predict(self, test_data, output_datatype):
        return self._model.predict(test_data).astype(output_datatype)

    def _read_data(self, data_directory):
        frames = [pd.read_pickle(match) for match in \
                  glob('%s/*/*' % data_directory)]
        data = pd.concat(frames)
        data.drop_duplicates(inplace=True)
        data = filter_stats(data)
        data = data.dropna()
        data['home_free_throw_percentage'].fillna(0, inplace=True)
        data['away_free_throw_percentage'].fillna(0, inplace=True)
        data['points_difference'] = data['home_points'] - data['away_points']
        return differential_vector(data)

    def _create_features(self, data):
        X = data.drop('away_points', axis=1)
        X = X.drop('home_points', axis=1)
        y = data[['home_points', 'away_points']].values
        split_data = train_test_split(X, y)
        self._X_train, self._X_test, self._y_train, self._y_test = split_data

    def _create_regressor(self):
        reg = RandomForestRegressor(n_estimators=50, max_features='sqrt')
        self._regressor = reg.fit(self._X_train, self._y_train)

    def _train_model(self):
        train = self._X_train
        self._model = SelectFromModel(self._regressor,
                                      prefit=True,
                                      threshold=0.01)
        self._X_train = self._model.transform(self._X_train)
        new_columns = train.columns[self._model.get_support()]
        self._filtered_features = [str(col) for col in new_columns]
Example #29
# Compute the correlation coefficients between variables #
for i in range(len(feature)):
    selection.append([
        feature[i],
        corr.loc[:,
                 [feature[i]]][(np.abs(corr.loc[:, [feature[i]]].values) > 0.8)
                               & (corr.loc[:, [feature[i]]].values != 1)].index
    ])

## Use sklearn's API for parameter tuning ##
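
# Hypothetical sketch of the sklearn-API tuning the note above refers to (the
# surrounding code trains through xgb.DMatrix instead; train_df/'label' are assumed):
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
# search = GridSearchCV(XGBClassifier(), {'max_depth': [3, 5, 7]},
#                       scoring='roc_auc', cv=3)
# search.fit(train_df.loc[:, feature].astype('float'), train_df['label'])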

### Prediction ###
test = pd.read_csv("./data/crawler_test.txt", sep='\t')
test.head()
dtest = xgb.DMatrix(data=test.loc[:, feature].astype('float'))
preds = model.predict(dtest)
dfff = pd.concat([
    test.loc[:, ["clientid", "updatetime", "label", "score"]],
    pd.DataFrame(preds, columns=["pred"])
],
                 axis=1)
dfff
dfff.to_csv("crawler_0708prediction.csv", index=False, header=False)

## Test whether the model file gives consistent results online and offline ##
df_test = pd.DataFrame({
    "allianceid": ["na"],
    "avginterval2minutes": [799.4],
    "avginterval5minutes": [813.13336],
    "clientid": ["09031172210287250176"],
    "clientip2minutes": [1],
Example #30
    model.compile(optimizer=optim,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # model.fit(x_ktrain, y_ktrain, batch_size=100, epochs=100, verbose=1)

    # y_kpred = np.argmax(model.predict(x_ktest), axis=1)

    # score = balanced_accuracy_score(y_ktest, y_kpred)
    # print(score)

    ############### model training
    y_clean = keras.utils.to_categorical(y_clean, 3)
    model.fit(x_clean, y_clean, batch_size=50, epochs=100, verbose=1)

    y_pred_mat[:, k] = np.argmax(model.predict(x_test_selected), axis=1)

y_pred = np.zeros(y_testid.shape[0])
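# majority vote: for each sample, take the most common class across the k models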
for j in range(y_pred_mat.shape[0]):
    y_pred[j] = Counter(y_pred_mat[j]).most_common(1)[0][0]

print('vote_mat:', y_pred_mat[:5])
print('vote_result:', y_pred[:5])

# # ################ write output file
with open('output.csv', 'w') as f:
    f.write("{},{}\n".format("id", "y"))
    for i in range(len(y_testid)):
        f.write("{},{}\n".format(y_testid[i], y_pred[i]))