Example #1
print('RandomForest on SMOTED Training Data:')
tic = time()
X_train = X_all[:y_train.shape[0]]
X_test = X_all[y_train.shape[0]:]
# note: fit_sample() is the old imbalanced-learn API; newer releases call it fit_resample()
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)
toc = time()
print('SMOTE time = ', toc - tic)
## create temporary train and test sets to examine the goodness of fit

X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(
    X_resampled, y_resampled, test_size=0.5, random_state=42)
#train and predict
importances, std, y_pred_tmp, exec_time = compute_feature_importances_RF(
    X_train_tmp, y_train_tmp, X_test_tmp)
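# compute_feature_importances_RF is an external helper not shown in this listing.
# A minimal sketch of what it might look like, assuming a plain RandomForestClassifier
# (n_estimators, n_jobs and random_state below are assumptions, not the author's values):
from time import time
from sklearn.ensemble import RandomForestClassifier

def compute_feature_importances_RF(X_tr, y_tr, X_te, n_estimators=100):
    # fit a forest, return impurity-based importances, their std across trees,
    # predictions on the held-out split and the elapsed wall-clock time
    tic = time()
    rf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=42)
    rf.fit(X_tr, np.ravel(y_tr))
    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
    y_pred = rf.predict(X_te)
    return importances, std, y_pred, time() - tic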
# test
confusion_matrix = calculate_confusion_matrix(np.ravel(y_test_tmp),
                                              np.ravel(y_pred_tmp))
true_neg, false_pos, false_neg, true_pos = (confusion_matrix[0, 0],
                                            confusion_matrix[0, 1],
                                            confusion_matrix[1, 0],
                                            confusion_matrix[1, 1])
# TP = 573312, FP = 52579, FN = 197, TN = 520939
print(confusion_matrix)
print(classification_report(y_test_tmp, y_pred_tmp))
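# calculate_confusion_matrix is also external; in this example it returns only the
# matrix itself. A minimal sketch, assuming it is a thin wrapper around sklearn's
# confusion_matrix with an explicit label order (so [0, 0] is the negative class,
# as unpacked above):
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

def calculate_confusion_matrix(y_true, y_pred, labels=None):
    # assumed behaviour: wrap sklearn's confusion_matrix; passing labels pins
    # the row/column order
    return sk_confusion_matrix(y_true, y_pred, labels=labels)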
#select features
top_features = filter_for_top_features(
    X_train.columns, importances, threshold_imps=0.95
)  # raising the threshold from 0.9 to 0.95 selects the same 68 features
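# filter_for_top_features is not shown either; a minimal sketch, assuming it keeps the
# smallest set of features whose cumulative importance reaches threshold_imps:
def filter_for_top_features(columns, importances, threshold_imps=0.95):
    order = np.argsort(importances)[::-1]            # most important first
    cum = np.cumsum(np.asarray(importances)[order])
    n_keep = int(np.searchsorted(cum, threshold_imps) + 1)
    return [columns[i] for i in order[:n_keep]]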

##Try ADASYN and compare the top_features sets
#tic = time()
#X_resampled2, y_resampled2 = ADASYN().fit_sample(X_train, y_train)
#toc = time()
                            subsample=0.25,
                            colsample_bytree=0.2,
                            max_delta_step=0,
                            gamma=3,
                            objective='multi:softmax',  # 'mlogloss' is an eval metric, not a valid objective
                            reg_alpha=0.5,
                            missing=np.nan)

    # fitting model
    #     gbm.fit(x_train, y_train, eval_set=[(x_train,y_train),(x_test, y_test)], eval_metric='mlogloss', verbose=True)
    #     print gbm.evals_result()
    gbm.fit(x_train, y_train)

    # training performance
    y_pred = gbm.predict(x_train)
    conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test
    y_pred = gbm.predict(x_test)
    conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)

    labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)

    print(np.unique(y_test))
    print(roc_auc_train, np.nanmean(roc_auc_train))
    print(roc_auc, np.nanmean(roc_auc))

# saving the results
if save_results:
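    # (the body of this block is truncated in the listing; it presumably pickles the
    #  results, mirroring the save blocks in Examples #3 and #5 below)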
Example #3
    #     conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test (layer 1)
    y_pred1 = gbm1.predict(x_test1)

    # test (layer 2)
    y_pred1_code = pd.DataFrame(
        columns=['loc {}'.format(j) for j in range(len(location_top))])
    for j in range(x_test1.shape[0]):
        y_pred1_code.loc[j, :] = one_hot_encoder(y_pred1[j],
                                                 np.array(location_top))
    x_test2 = pd.concat([x_test1, y_pred1_code], axis=1)
    y_pred = gbm2.predict(x_test2)

    # test performance
    conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)

    # foursquare performance
    #conf_fsq, roc_auc_fsq = calculate_confusion_matrix(state_fsq_all[i], y_test)

    labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)
    #confs_fsq.append(conf_fsq)
    #aucs_fsq.append(roc_auc_fsq)

    #     print 'train'
    #     print np.unique(y_train)
    #     #print conf
    #     print np.nanmean(roc_auc_train)
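# one_hot_encoder, used in the layer-2 stacking step above, is not defined in the
# listing. A minimal sketch, assuming it maps one predicted label to a 0/1 vector
# over the location_top categories:
def one_hot_encoder(label, categories):
    # 1 for the matching category, 0 everywhere else
    return (np.asarray(categories) == label).astype(int)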
labels = []
inds = np.arange(0, len(target_all), 1)

for i in range(n_boot):

    print "------------------"
    print i

    #     ind_boot = np.random.choice(inds, size=inds.size, replace=True)
    ind_boot = np.random.choice(inds, size=int(np.floor(inds.size * split)), replace=False)

    y_report = pd.concat([target_all[j]["location"] for j in ind_boot], axis=0)
    y_fsq = pd.concat([target_all[j]["fsq"] for j in ind_boot], axis=0)

    # foursquare performance
    conf, roc_auc = calculate_confusion_matrix(y_fsq, y_report)

    labels.append(np.unique(y_report))
    confs.append(conf)
    aucs.append(roc_auc)

    print(np.unique(y_report))
    print(roc_auc, np.nanmean(roc_auc))

# saving the results
if save_results:
    with open("auc_location_new_10fold_fsq2.dat", "w") as f:
        pickle.dump([aucs, confs, labels], f)
    f.close()
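# In Examples #3-#8, calculate_confusion_matrix returns a (confusion matrix, per-class
# ROC AUC) pair. A minimal sketch under the assumption that the AUC is computed
# one-vs-rest from the hard predictions, with NaN for classes missing from y_true
# (which is why np.nanmean is used when printing):
from sklearn.metrics import confusion_matrix as sk_confusion_matrix, roc_auc_score

def calculate_confusion_matrix(y_pred, y_true):
    y_pred, y_true = np.asarray(y_pred), np.asarray(y_true)
    labels = np.unique(np.concatenate([y_true, y_pred]))
    conf = sk_confusion_matrix(y_true, y_pred, labels=labels)
    roc_auc = np.full(labels.size, np.nan)
    for k, lab in enumerate(labels):
        true_k = (y_true == lab).astype(int)
        pred_k = (y_pred == lab).astype(int)
        if true_k.min() != true_k.max():      # both classes must be present
            roc_auc[k] = roc_auc_score(true_k, pred_k)
    return conf, roc_auc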

Example #5
                else:
                    feature_new = np.append(feature_new, [feature[i,:]], axis=0)
                state_new = np.append(state_new, state[i])
        state = state_new
        feature = feature_new
        
        #creating train and test sets
        x_train, y_train, x_test, y_test = split_balanced(feature, state, 0.5)

        #train
        gbm = xgboost.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x_train, y_train)

        #test
        predictions = gbm.predict(x_test)

        conf, roc_auc = calculate_confusion_matrix(predictions, y_test)
        print(np.unique(state))
        print(roc_auc)
        print('------------------')
    
        labels.append(np.unique(np.append(y_test, predictions)))
        confs.append(conf)
        aucs.append(roc_auc)

# saving the results
# open in binary mode for pickle; the with-block already closes the handle
with open('accuracy_personal.dat', 'wb') as f:
    pickle.dump([aucs, confs, labels], f)
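# split_balanced, used above, is another helper the listing omits; a minimal sketch,
# assuming it draws an equal number of samples per class (capped at the rarest class)
# and then splits that balanced subset by the given train fraction:
def split_balanced(feature, state, train_frac=0.5, seed=0):
    rng = np.random.RandomState(seed)
    classes = np.unique(state)
    n_per_class = min(int(np.sum(state == c)) for c in classes)
    n_train = int(np.floor(n_per_class * train_frac))
    train_idx, test_idx = [], []
    for c in classes:
        idx = rng.permutation(np.where(state == c)[0])[:n_per_class]
        train_idx.extend(idx[:n_train])
        test_idx.extend(idx[n_train:])
    train_idx, test_idx = np.array(train_idx), np.array(test_idx)
    return feature[train_idx], state[train_idx], feature[test_idx], state[test_idx]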


# In[35]:
    # remove foursquare data
    fsq_cols = ['fsq 0', 'fsq 1', 'fsq 2', 'fsq 3', 'fsq 4',
                'fsq 5', 'fsq 6', 'fsq 7', 'fsq 8', 'fsq distance']
    x_train = x_train.drop(fsq_cols, axis=1)
    x_test = x_test.drop(fsq_cols, axis=1)
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    
    # train (layer 1)
    #eta_array = np.array([0.05]*200+[0.02]*200+[0.01]*200)
    gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05,
                            nthread=12, subsample=0.1, colsample_bytree=0.5,
                            max_delta_step=0, gamma=2, objective='multi:softmax',
                            reg_alpha=0.9, missing=np.nan)
#     gbm.fit(x_train, y_train, eval_set=[(x_train,y_train),(x_test, y_test)], eval_metric='mlogloss', verbose=True)
#     print gbm.evals_result()
    gbm.fit(x_train, y_train)
    
    # training performance
    y_pred = gbm.predict(x_train)
    conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test
    y_pred = gbm.predict(x_test)
    conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)
    
    # foursquare performance
    #conf_fsq, roc_auc_fsq = calculate_confusion_matrix(state_fsq_all[i], y_test)
    
    labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)

    print(np.unique(y_test))
    print(roc_auc_train, np.nanmean(roc_auc_train))
    print(roc_auc, np.nanmean(roc_auc))
Example #7
#     x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)
#     x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)
    
    # train (layer 1)
    #eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
    gbm = xgb.XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.01,
                            nthread=12, subsample=1,
                            max_delta_step=0).fit(x_train, y_train)
    
    # train performance
#     y_pred = gbm.predict(x_train)
#     conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test (layer 1)
    y_pred = gbm.predict(x_test)
    
    # test performance
    conf, roc_auc = calculate_confusion_matrix(y_pred, y_test)
    
    # foursquare performance
    #conf_fsq, roc_auc_fsq = calculate_confusion_matrix(state_fsq_all[i], y_test)
    
    labels.append(np.unique(y_test))
    confs.append(conf)
    aucs.append(roc_auc)
    #confs_fsq.append(conf_fsq)
    #aucs_fsq.append(roc_auc_fsq)

#     print 'train'
#     print np.unique(y_train)
#     #print conf
#     print np.nanmean(roc_auc_train)
Example #8
                state_new = np.append(state_new, state[i])
        state = state_new
        feature = feature_new

        #creating train and test sets
        x_train, y_train, x_test, y_test = split_balanced(feature, state, 0.5)

        #train
        gbm = xgboost.XGBClassifier(max_depth=3,
                                    n_estimators=300,
                                    learning_rate=0.05).fit(x_train, y_train)

        #test
        predictions = gbm.predict(x_test)

        conf, roc_auc = calculate_confusion_matrix(predictions, y_test)
        print(np.unique(state))
        print(roc_auc)
        print('------------------')

        labels.append(np.unique(np.append(y_test, predictions)))
        confs.append(conf)
        aucs.append(roc_auc)

# saving the results
# open in binary mode for pickle; the with-block already closes the handle
with open('accuracy_personal.dat', 'wb') as f:
    pickle.dump([aucs, confs, labels], f)

# In[35]:
#     # test set
#     x_test = feature.loc[inds[split:],:]
#     y_test = target.loc[inds[split:]]
#     x_test = x_test.reset_index(drop=True)
#     y_test = y_test.reset_index(drop=True)

    x_train, y_train, x_test, y_test = split_binary(feature, target, 0.9, oversample=True)

    # train (layer 1)
    #eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
    gbm = xgb.XGBClassifier(max_depth=20, n_estimators=400, learning_rate=0.001,
                            nthread=12, subsample=.1,
                            colsample_bytree=np.sqrt(feature.shape[1]) / float(feature.shape[1]),
                            colsample_bylevel=1, max_delta_step=20, seed=0,
                            objective='binary:logistic').fit(x_train, y_train)
    # TODO: cross-validate via xgb.cv(...)
    
    # train performance
    y_pred = gbm.predict(x_train)
    conf, auc = calculate_confusion_matrix(y_pred, y_train)
    aucs_train[i] = auc[0]

    # test (layer 1)
    y_pred = gbm.predict(x_test)
    
    # test performance
    conf, auc = calculate_confusion_matrix(y_pred, y_test)
    aucs[i] = auc[0]
    confs.append(conf)
    

# conf, roc_auc = calculate_confusion_matrix(y_pred, np.array(target))

print()
print('Train AUC: {}'.format(np.nanmean(aucs_train)))
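# split_binary, used in Example #8 above, is also not part of the listing. A minimal
# sketch, assuming feature is a DataFrame and target a binary Series, where
# oversample=True resamples the minority class of the training part with replacement
# until the classes are balanced (the seed and the resampling scheme are assumptions):
def split_binary(feature, target, train_frac, oversample=False, seed=0):
    rng = np.random.RandomState(seed)
    order = rng.permutation(len(target))
    n_train = int(np.floor(len(target) * train_frac))
    tr, te = order[:n_train], order[n_train:]
    x_train = feature.iloc[tr].reset_index(drop=True)
    y_train = target.iloc[tr].reset_index(drop=True)
    x_test = feature.iloc[te].reset_index(drop=True)
    y_test = target.iloc[te].reset_index(drop=True)
    if oversample:
        counts = y_train.value_counts()
        minority = counts.idxmin()
        extra = rng.choice(np.where(y_train == minority)[0],
                           size=int(counts.max() - counts.min()), replace=True)
        x_train = pd.concat([x_train, x_train.iloc[extra]], ignore_index=True)
        y_train = pd.concat([y_train, y_train.iloc[extra]], ignore_index=True)
    return x_train, y_train, x_test, y_test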