示例#1
0
                             num_cv=1, num_folds=10, scale='log')

# Tabulate the cross-validation scores: rows are labelled 'auc' and 'std',
# columns carry the 'model' entry of every params_ens element plus 'ENS'.
roc_aucs = pd.DataFrame(
    np.array(roc_aucs),
    index=['auc', 'std'],
    columns=[member['model'] for member in params_ens] + ['ENS'])

# Write the score table and feature_set to CSV under data/submi/.
roc_aucs.to_csv('data/submi/roc_aucs.csv', float_format='%11.6f')
feature_set.to_csv('data/submi/features.csv')

# Single-item prints are parenthesised so the lines parse on Python 2 and 3.
print("cross validation results:")
print(roc_aucs)

############################################################################
# fit and predict
############################################################################


# Run the ensemble fit/predict helper with log scaling; presumably it trains
# on info_humans / info_bots and scores info_test -- confirm in fb_funcs.
result = fb_funcs.fit_and_predict(
    info_humans, info_bots, info_test,
    params=params_ens, scale='log')

# Pull the two entries used by the rest of the script out of the result.
y_test_proba = result['y_test_proba']
ytps = result['ytps']

# feature_importances = pd.DataFrame(np.array([result['features'],
                                             # result['importances']]).T)

############################################################################
# submission file generation
############################################################################
# Paths handed to write_submission: the test CSV and the submission target.
testfile = 'data/test.csv'
submissionfile = 'data/submi/sub_ens.csv'

print("writing a submission file...")
write_submission(y_test_proba, info_test.index, testfile, submissionfile)
示例#2
0
roc_auc = np.array(roc_auc)
roc_auc_std = np.array(roc_auc_std)
clf_score = np.array(clf_score)

print ""
print roc_auc.mean(), roc_auc_std.mean()
print clf_score.mean(), clf_score.std()
# print tpr_50


############################################################################
# fit and predict
############################################################################

# Run fit_and_predict with the 'ET' model setting, 1000 estimators and
# plotting enabled; the third returned value is discarded.
y_test_proba, y_train_proba, _ = fit_and_predict(
    info_humans, info_bots, info_test,
    model='ET', n_estimators=1000, p_use=None, plotting=True)

############################################################################
# xgboost: CV
############################################################################

# y_pred, ytrain_pred, cv_result \
#     = fit_and_predict(info_humans, info_bots, info_test,
#                       n_estimators=20, p_use=None, cv=5)

# auc = []
# for i in range(len(cv_result)):
#     auc.append(float(cv_result[i].split('\t')[1].split(':')[1].split('+')[0]))

# best_itr = np.argmax(auc)
# auc_std = float(cv_result[11].split('\t')[1].split('+')[1])
                                       .union(dinfo_bots.keys())\
                                       .union(dinfo_test.keys())
        # Pass each frame through append_device together with its own device
        # info and the shared devices_appended collection (what append_device
        # adds is defined elsewhere -- confirm there).
        info_humans = append_device(
            info_humans, dinfo_humans, devices_appended)
        info_bots = append_device(
            info_bots, dinfo_bots, devices_appended)
        info_test = append_device(
            info_test, dinfo_test, devices_appended)

        # Zero-fill missing values in all three frames first, then drop the
        # 'merchandise' column from each; both operations mutate in place.
        info_frames = (info_humans, info_bots, info_test)
        for info_frame in info_frames:
            info_frame.fillna(0, inplace=True)
        for info_frame in info_frames:
            info_frame.drop('merchandise', inplace=True, axis=1)

        # Fit/predict with the 'ET' model setting; the third returned value
        # is discarded.
        y_test_proba, y_train_proba, _, features = fit_and_predict(
            info_humans, info_bots, info_test, model='ET', params=params)

        # Hard-coded list of 40 feature names (phone* entries mixed with
        # num_* count features). Presumably ordered by importance from an
        # earlier run -- confirm against the experiment that produced it.
        device_importance = [
            'phone46', 'num_bids', 'phone143', 'num_ips',
            'num_aucs', 'phone55', 'num_urls', 'phone63',
            'phone2287', 'phone2330', 'phone239', 'phone110',
            'phone3359', 'phone168', 'num_devices', 'phone22',
            'num_countries', 'phone33', 'phone205', 'phone150',
            'phone1026', 'phone728', 'phone136', 'phone25',
            'phone224', 'phone640', 'phone1166', 'phone892',
            'phone2955', 'phone1013', 'phone195', 'phone58',
            'phone4479', 'phone469', 'phone90', 'phone15',
            'phone996', 'phone5479', 'phone792', 'phone4',
        ]