                              num_cv=1, num_folds=10, scale='log')

roc_aucs = pd.DataFrame(np.array(roc_aucs), index=['auc', 'std'],
                        columns=map(lambda x: x['model'], params_ens) + ['ENS'])
roc_aucs.to_csv('data/submi/roc_aucs.csv', float_format='%11.6f')
feature_set.to_csv('data/submi/features.csv')

print "cross validation results:"
print roc_aucs

############################################################################
# fit and predict
############################################################################

result = fb_funcs.fit_and_predict(info_humans, info_bots, info_test,
                                  params=params_ens, scale='log')

y_test_proba = result['y_test_proba']
ytps = result['ytps']
# feature_importances = pd.DataFrame(np.array([result['features'],
#                                              result['importances']]).T)

############################################################################
# submission file generation
############################################################################

submissionfile = 'data/submi/sub_ens.csv'
testfile = 'data/test.csv'

print "writing a submission file..."
write_submission(y_test_proba, info_test.index, testfile, submissionfile)
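############################################################################
# (illustration only)  write_submission is imported from elsewhere in the
# repo; the sketch below is a hypothetical stand-in, assuming the submission
# format is two columns (bidder_id, prediction) and that bidders absent from
# the bid log get probability 0.
############################################################################

def write_submission_sketch(y_proba, bidder_ids, testfile, outfile):
    import pandas as pd
    # map predicted probabilities onto the full bidder list from test.csv
    test = pd.read_csv(testfile)
    pred = pd.Series(y_proba, index=bidder_ids, name='prediction')
    sub = test[['bidder_id']].join(pred, on='bidder_id')
    sub['prediction'] = sub['prediction'].fillna(0.0)
    sub.to_csv(outfile, index=False)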
roc_auc = np.array(roc_auc)
roc_auc_std = np.array(roc_auc_std)
clf_score = np.array(clf_score)

print ""
print roc_auc.mean(), roc_auc_std.mean()
print clf_score.mean(), clf_score.std()
# print tpr_50

############################################################################
# fit and predict
############################################################################

y_test_proba, y_train_proba, _ \
    = fit_and_predict(info_humans, info_bots, info_test, model='ET',
                      n_estimators=1000, p_use=None, plotting=True)

############################################################################
# xgboost: CV
############################################################################

# y_pred, ytrain_pred, cv_result \
#     = fit_and_predict(info_humans, info_bots, info_test,
#                       n_estimators=20, p_use=None, cv=5)

# auc = []
# for i in range(len(cv_result)):
#     auc.append(float(cv_result[i].split('\t')[1].split(':')[1].split('+')[0]))
# best_itr = np.argmax(auc)

# auc_std = float(cv_result[11].split('\t')[1].split('+')[1])
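# The commented-out block above parses the string output returned by older
# versions of xgb.cv(), one line per boosting round.  A small live example of
# the same extraction on a made-up line (field layout assumed to match what
# the parsing above expects):
example_line = '[7]\tcv-test-auc:0.912345+0.004321'
example_mean_auc = float(example_line.split('\t')[1].split(':')[1].split('+')[0])  # -> 0.912345
example_std_auc = float(example_line.split('\t')[1].split('+')[1])                 # -> 0.004321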
    .union(dinfo_bots.keys())\
    .union(dinfo_test.keys())

# append per-device count columns to each feature frame
# (a hedged sketch of append_device follows the importance list below)
info_humans = append_device(info_humans, dinfo_humans, devices_appended)
info_bots = append_device(info_bots, dinfo_bots, devices_appended)
info_test = append_device(info_test, dinfo_test, devices_appended)

info_humans.fillna(0, inplace=True)
info_bots.fillna(0, inplace=True)
info_test.fillna(0, inplace=True)

info_humans.drop('merchandise', inplace=True, axis=1)
info_bots.drop('merchandise', inplace=True, axis=1)
info_test.drop('merchandise', inplace=True, axis=1)

y_test_proba, y_train_proba, _, features \
    = fit_and_predict(info_humans, info_bots, info_test, model='ET',
                      params=params)

# device and count features, presumably ordered by importance from an
# earlier ET run
device_importance = ['phone46', 'num_bids', 'phone143', 'num_ips', 'num_aucs',
                     'phone55', 'num_urls', 'phone63', 'phone2287',
                     'phone2330', 'phone239', 'phone110', 'phone3359',
                     'phone168', 'num_devices', 'phone22', 'num_countries',
                     'phone33', 'phone205', 'phone150', 'phone1026',
                     'phone728', 'phone136', 'phone25', 'phone224',
                     'phone640', 'phone1166', 'phone892', 'phone2955',
                     'phone1013', 'phone195', 'phone58', 'phone4479',
                     'phone469', 'phone90', 'phone15', 'phone996',
                     'phone5479', 'phone792', 'phone4']
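############################################################################
# (illustration only)  append_device is defined elsewhere in the repo; the
# sketch below is a hypothetical stand-in, assuming dinfo_* are per-bidder
# device-count DataFrames and devices_appended is the union of their columns.
############################################################################

def append_device_sketch(info, dinfo, devices_appended):
    # align the device-count frame to the common device set, then join on the
    # bidder_id index; missing devices become NaN and are zero-filled later
    return info.join(dinfo.reindex(columns=list(devices_appended)))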