# n_estimators sweep for XGBClassifier: plots train/test accuracy over a grid.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier as XGBC


def study(data):
    x = data.loc[:, data.columns != 'y']
    y = data['y']
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=0.2, random_state=480)
    xticks = np.arange(500, 600, 5)
    train_scores = []
    test_scores = []
    for i in xticks:
        model = XGBC(n_estimators=i, learning_rate=0.05)
        model.fit(xtrain, ytrain)
        ytrain_pred = model.predict(xtrain)
        ytest_pred = model.predict(xtest)
        train_score = accuracy_score(ytrain, ytrain_pred)
        test_score = accuracy_score(ytest, ytest_pred)
        train_scores.append(train_score)
        test_scores.append(test_score)
    # sorted_feature_importances = model.feature_importances_[np.argsort(-model.feature_importances_)]
    # rank the n_estimators values by test accuracy (best first)
    test_scores = np.array(test_scores, dtype='float32')
    sorted_test_scores = test_scores[np.argsort(-test_scores)]
    sorted_xtick = xticks[np.argsort(-test_scores)]
    print([*zip(sorted_test_scores, sorted_xtick)])
    plt.plot(xticks, train_scores, label='train')
    plt.plot(xticks, test_scores, label='test')
    plt.legend()
    plt.show()
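# Hypothetical driver for study(): the CSV path and the presence of a binary
# 'y' target column are assumptions, not shown in the function above.
import pandas as pd

data = pd.read_csv('train.csv')  # assumed file name
study(data)  # prints the ranked n_estimators grid and shows the accuracy curves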
# Cross-validation objective for a hyper-parameter search.
# X, y, score (a scoring string or callable) and kfold are expected to be
# defined at module level.
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier as XGBC


def xgbcv(num_round, subsample, eta, max_depth):
    val = cross_val_score(
        XGBC(n_estimators=int(num_round),    # num_round maps to the sklearn-API n_estimators
             subsample=float(subsample),
             learning_rate=min(eta, 0.999),  # eta is the native-API alias of learning_rate
             max_depth=int(max_depth),
             random_state=2),
        X, y,
        scoring=score,
        cv=kfold).mean()
    return val
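# xgbcv() maps continuous inputs to a scalar CV score, which is the shape of
# objective bayes_opt expects; a sketch of wiring it up. The bounds and
# iteration counts below are illustrative assumptions, and X, y, score, kfold
# must already be defined.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgbcv,
    pbounds={'num_round': (100, 1000),
             'subsample': (0.5, 1.0),
             'eta': (0.01, 0.3),
             'max_depth': (3, 10)},
    random_state=2)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best score and parameter set found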
# In[]:
X = pd.DataFrame(X, columns=["X1", "X2"])
y = pd.DataFrame(y, columns=["y"])
data = pd.concat([X, y], axis=1)

# In[]:
ft.Sample_imbalance(data, "y")

# In[]:
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
ft.sample_category(Ytest, Ytrain)

# In[]:
# Modelling with the sklearn API
clf = XGBC().fit(Xtrain, Ytrain)
ypred = clf.predict(Xtest)
ypred_proba = clf.predict_proba(Xtest)

# In[]:
print(clf.score(Xtest, Ytest))  # default evaluation metric: accuracy
print(cm(Ytest, ypred, labels=[1, 0]))  # list the minority class first
print(recall(Ytest, ypred))
print(auc(Ytest, clf.predict_proba(Xtest)[:, 1]))

# In[]:
clf = XGBC(scale_pos_weight=10).fit(Xtrain, Ytrain)  # ratio of negative (0) to positive (1) samples
ypred = clf.predict(Xtest)
ypred_proba = clf.predict_proba(Xtest)

# In[]:
print(clf.score(Xtest, Ytest))  # default evaluation metric: accuracy
print(cm(Ytest, ypred, labels=[1, 0]))  # list the minority class first
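# In[]:
# scale_pos_weight is usually derived from the data rather than hard-coded to 10;
# a sketch computing it as the negative/positive ratio of the training labels
# (clf_weighted is an added name, not from the original cells).
import numpy as np

ytr = np.ravel(Ytrain)  # flatten in case Ytrain is a one-column DataFrame
neg, pos = (ytr == 0).sum(), (ytr == 1).sum()
clf_weighted = XGBC(scale_pos_weight=neg / pos).fit(Xtrain, Ytrain)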
# Split into train/test sets using t_t_s (train_test_split).
# Because we combined the datasets to apply uniform one-hot and label encoding,
# we set the 'shuffle' parameter to False; we also know that there should be
# 15060 rows in the test set.
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)
X_train, X_test, Y_train, Y_test = t_t_s(rescaledX, Y,
                                         test_size=test_set_size,
                                         random_state=seed,
                                         shuffle=False)

# instantiate the XGBClassifier with default parameters
model = XGBC()

# fit the model to the training data
print('\n training the model...')
model.fit(X_train, Y_train)

# view the trained model
print('\n model...')
print(model)

# make predictions for the test data
print('\n making predictions...')
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
train_predictions = model.score(X_train, Y_train)
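# The block above only scores the training data; a sketch of the matching
# test-set evaluation (the accuracy_score import and print labels are additions).
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(Y_test, predictions)
print('\n train accuracy...')
print(train_predictions)
print('\n test accuracy...')
print(test_accuracy)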
def train(data):
    x = data.loc[:, data.columns != 'y']
    y = data['y']
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=0.2, random_state=100)
    model = XGBC(n_estimators=500, learning_rate=0.05, eval_metric='auc')
    model.fit(xtrain, ytrain)
    # Recorded runs:
    # train score: 0.900719 / test score: 0.893923
    # train score: 0.920631 / test score: 0.899448
    # train score: 0.927821 / test score: 0.903867
    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)
    train_score = accuracy_score(ytrain, ytrain_pred)
    test_score = accuracy_score(ytest, ytest_pred)
    print('train score: %f \n test score: %f' % (train_score, test_score))
    print('roc auc', roc_auc_score(ytrain, ytrain_pred))

    # rank features by importance (descending)
    sorted_feature_importances = model.feature_importances_[np.argsort(
        -model.feature_importances_)]
    feature_importance_names = x.columns[np.argsort(
        -model.feature_importances_)]
    print([*zip(feature_importance_names, sorted_feature_importances)])
    fi = pd.DataFrame(
        [*zip(feature_importance_names, sorted_feature_importances)],
        columns=['name', 'score'])
    fi = fi.sort_values(by=['score'], ascending=True)
    fi = fi.reset_index(drop=True)

    # horizontal bar-style plot of the feature importances
    ax = plt.gca()
    ax.hlines(y=fi.index, xmin=0, xmax=fi.score,
              color='firebrick', alpha=0.4, linewidth=30)
    for index, row in fi.iterrows():
        plt.text(row['score'], index, round(row['score'], 2),
                 horizontalalignment='left', verticalalignment='center',
                 fontdict={'color': 'black', 'fontsize': 30})
    plt.yticks(fi.index, fi.name, fontsize=30)
    # ax.scatter(x=fi.index, y=fi.score, s=75, color='firebrick', alpha=0.7)
    plt.show()

    train_confusion_matrix = confusion_matrix(ytrain, ytrain_pred)
    test_confusion_matrix = confusion_matrix(ytest, ytest_pred)
    print('train confusion matrix:\n %s' % train_confusion_matrix)
    print('test confusion matrix:\n %s' % test_confusion_matrix)
    train_classification_report = classification_report(ytrain, ytrain_pred)
    test_classification_report = classification_report(ytest, ytest_pred)
    print('train classification report:\n %s' % train_classification_report)
    print('test classification report:\n %s' % test_classification_report)
    return model, fi
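# Brief usage sketch: persist the ranked importances returned by train();
# the output file name is hypothetical and 'data' is the same table passed to study().
model, fi = train(data)
fi.sort_values(by='score', ascending=False).to_csv('feature_importances.csv', index=False)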
import pandas as pd
from xgboost import XGBClassifier as XGBC


def predict_sent(vecs, xgb_model_analyze):
    # load a previously saved XGBoost classifier from the given path and
    # score the vectors; predict() and w2v_model are defined elsewhere in the module
    xgb = XGBC()
    xgb.load_model(xgb_model_analyze)
    pred = predict(vecs, w2v_model, xgb, 300)
    df = pd.DataFrame(pred, columns=['sent'])
    return df
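# Hypothetical counterpart producing the model file loaded above; xgb_clf,
# train_vecs, train_labels, the hyper-parameters and the file name are all
# assumed, not from the original code.
xgb_clf = XGBC(n_estimators=300, learning_rate=0.05)
xgb_clf.fit(train_vecs, train_labels)
xgb_clf.save_model('xgb_model_analyze.json')  # path later passed to predict_sent()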
find_ID(X_test[i].tolist(), X_all.tolist()))

to_get_single_stats = False
if to_get_single_stats:
    print('Accuracy:', sum(res_pred == y_test) / float(len(y_test)))
    print(clf.coef_[0])
    fig, ax = plt.subplots()
    plt.barh(range(len(y_ticklabels)), clf.coef_[0])
    ax.set_yticklabels(y_ticklabels)
    plt.xlabel('coefs from linear SVM', fontsize=20)
    plt.tight_layout()
    plt.show()
elif use_method == 'xgbc':
    from xgboost import XGBClassifier as XGBC
    from xgboost import plot_importance
    clf = XGBC()
    clf.fit(X_train, y_train)
    res_pred = clf.predict(X_test)
    # print res_pred
    # print y_test
    print('Accuracy:', sum(res_pred == y_test) / float(len(y_test)))
    print(clf.feature_importances_)
    plot_importance(clf)
    ax = plt.gca()
    curr_labels = ax.get_yticklabels()
    curr_inds = []
    for one_label in curr_labels:
        curr_label = one_label.get_text()
        one_ind = int(curr_label[1])
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc

class_1 = 500
class_2 = 50
centers = [[0.0, 0.0], [2.0, 2.0]]
clusters_std = [1.5, 0.5]
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0,
                  shuffle=False)
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

clf = XGBC().fit(Xtrain, Ytrain)
ypred = clf.predict(Xtest)
clf.score(Xtest, Ytest)
cm(Ytest, ypred, labels=[1, 0])
recall(Ytest, ypred)
auc(Ytest, clf.predict_proba(Xtest)[:, 1])

clf_ = XGBC(scale_pos_weight=10).fit(Xtrain, Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest, Ytest)
cm(Ytest, ypred_, labels=[1, 0])
recall(Ytest, ypred_)
auc(Ytest, clf_.predict_proba(Xtest)[:, 1])

for i in [1, 5, 10, 20, 30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain, Ytrain)
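# The loop above only fits the models; a sketch of the comparison it presumably
# leads into, printing accuracy, recall and AUC per scale_pos_weight value
# (the loop body and print format are assumptions).
for i in [1, 5, 10, 20, 30]:
    clf_i = XGBC(scale_pos_weight=i).fit(Xtrain, Ytrain)
    ypred_i = clf_i.predict(Xtest)
    print(i,
          'accuracy: %.3f' % clf_i.score(Xtest, Ytest),
          'recall: %.3f' % recall(Ytest, ypred_i),
          'auc: %.3f' % auc(Ytest, clf_i.predict_proba(Xtest)[:, 1]))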
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier as XGBC

battles = pd.read_csv('data/battles.csv')
character_predictions = pd.read_csv('data/character-predictions.csv')
# q01_feature_engineering and q08_preprocessing are defined in earlier steps
battle, character_pred = q01_feature_engineering(battles, character_predictions)
death_preds = q08_preprocessing(character_pred)

# balanced training sample: 350 survivors and 350 deaths
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
X = pd.concat([
    death_preds[death_preds.actual == 0].sample(350, random_state=62),
    death_preds[death_preds.actual == 1].sample(350, random_state=62)
]).copy(deep=True).astype(np.float64)
Y = X.actual.values

# everything not sampled into X becomes the held-out set
tX = death_preds[~death_preds.index.isin(X.index)].copy(deep=True).astype(np.float64)
tY = tX.actual.values
X.drop(['SNo', 'actual', 'DateoFdeath'], axis=1, inplace=True)
tX.drop(['SNo', 'actual', 'DateoFdeath'], axis=1, inplace=True)

clf_xgb = XGBC(subsample=.8, colsample_bytree=.8, seed=14, max_depth=3)


def q09_XGBoost(X_train, y_train, X_test, y_test, clf_xgb):
    'write your solution here'
    model = clf_xgb
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pred_prob = model.predict_proba(X_test)  # score the passed-in test set, not the global tX
    roc_auc = roc_auc_score(y_test, pred_prob[:, 1])
    accuracy = accuracy_score(y_test, np.argmax(pred_prob, axis=1))
    return roc_auc, accuracy
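# Minimal call of the grader-style function above, using the balanced training
# sample (X, Y) and the held-out frame (tX, tY) built in this snippet; the
# printed format is an addition, not from the source.
roc_auc, accuracy = q09_XGBoost(X, Y, tX, tY, clf_xgb)
print('ROC AUC: %.3f, accuracy: %.3f' % (roc_auc, accuracy))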
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)
(y == 1).sum() / y.shape[0]

# In[11]:
x.shape

# In[12]:
y.shape

# In[13]:
clf = XGBC().fit(xtrain, ytrain)
ypred = clf.predict(xtest)

# In[14]:
ypred

# In[15]:
cm(ytest, ypred, labels=[1, 0])

# In[16]:
recall(ytest, ypred)

# In[17]:
clf = RandomForestClassifier()
print('_' * 20, clf.__class__.__name__, '_' * 20)
print("Training the data")
t0 = time()
results_rf = test_classifier(clf, my_dataset, feature_list, folds)
print("done in %0.3fs" % (time() - t0))
cm_rf = [[results_rf['true_negatives'], results_rf['false_negatives']],
         [results_rf['true_positives'], results_rf['false_positives']]]

# In[44]:
from xgboost import XGBClassifier as XGBC

clf = XGBC()
print('_' * 20, clf.__class__.__name__, '_' * 20)
print("Training the data")
t0 = time()
results_xgb = test_classifier(clf, my_dataset, feature_list, folds)
print("done in %0.3fs" % (time() - t0))
cm_xgb = [[results_xgb['true_negatives'], results_xgb['false_negatives']],
          [results_xgb['true_positives'], results_xgb['false_positives']]]

# In[36]:
from sklearn.linear_model import LogisticRegression
# ECG biometric identification experiment: load MIT-BIH records, segment the
# heartbeats, extract PCA features, train the selected classifier and return
# the accuracy (plus training time).
def optimCurveFit(strategy, method_clsf, ratio=0.8, NV_type='NVequals'):
    constrain_time = True

    ######################
    # TODO Step 1: Data input
    ######################
    data_set = 'mitdb'  # 'ecgiddb', 'mitdb'
    channel = 0
    records, IDs, fss, annss = mf.load_data(
        data_set, channel)  # , num_persons=60, record_time=20)
    fs = fss[0]
    records = np.array(records)
    IDs = np.array(IDs)
    annss = np.array(annss)
    ######################

    ######################
    # TODO Step 2: Data selection
    ######################
    if (strategy == 'allN_data') or (strategy == 'all_data'):
        pass  # do nothing here
    elif strategy == 'NV_data':
        NV_inds = [6, 15, 18, 23, 24, 26, 29, 31, 33, 35, 39, 41, 42, 46]
        # for i in NV_inds:  # range(annss.shape[0]):
        #     print i, Counter(annss[i][1])['V']
        records = records[NV_inds, :]
        IDs = IDs[NV_inds]
        annss = annss[NV_inds, :]
        ## re-numbering the IDs... wtf
        for i in range(len(NV_inds)):
            IDs[i] = i
    elif strategy == 'combine_IDs':
        num_to_combine = 4
        print(IDs)
        for i in range(int(len(records) / num_to_combine)):
            for j in range(num_to_combine - 1):
                IDs[i * num_to_combine + j + 1] = IDs[i * num_to_combine + j]
                # IDs[i*2+1] = IDs[i*2]
        for i in range(len(IDs)):
            IDs[i] /= num_to_combine

    if constrain_time:
        look_time = 600.  # in s
        look_ind = int(look_time * fs)
        records = records[:, :look_ind]
        annss = annss[:, :look_ind]

    recs = []
    for i in range(len(records)):
        curr_rec = Rec(records[i], fs, IDs[i], annss[i])
        recs.append(curr_rec)
    ######################

    ######################
    # TODO Step 3: Data filtering
    ######################
    ######################

    ######################
    # TODO Step 4: Data segmentation
    ######################
    USE_BIOSPPY_FILTERED = True
    sigs, labels_bySegs = mf.get_seg_data(records, IDs, fss,
                                          USE_BIOSPPY_FILTERED, annss=annss)
    sigs, labels_bySegs = np.array(sigs), np.array(labels_bySegs)
    mrks_bySegs = np.array([x[-1] for x in labels_bySegs])
    if strategy == 'allN_data':
        N_masks = (mrks_bySegs == 'N')
        sigs = sigs[N_masks, :]
        labels_bySegs = labels_bySegs[N_masks]
    IDs_bySegs = [int(x[:-1]) for x in labels_bySegs]
    mrks_bySegs = [x[-1] for x in labels_bySegs]
    IDs_bySegs, mrks_bySegs = np.array(IDs_bySegs), np.array(mrks_bySegs)
    segs = []
    for i in range(len(sigs)):
        curr_seg = Seg(sig=sigs[i], fs=fs, ID=IDs_bySegs[i],
                       mrk=mrks_bySegs[i])
        segs.append(curr_seg)
    segs = np.array(segs)
    ######################
    # for one_label in labels_all:
    #     if ('N' in one_label) or ('V' in one_label):
    #         print one_label
    # quit()
    # segs_all, labels_all = np.array(segs_all), np.array(labels_all)

    ######################
    # TODO Step 5: Feature extraction
    ######################
    X_all = []
    y_all = []
    method_feat = 'PCA'  # 'template_matching'
    if method_feat == 'PCA':
        feat_dim = 20
        pca = PCA(n_components=feat_dim)
        X_all = np.array([x.sig for x in segs])
        X_all = pca.fit(X_all).transform(X_all)
        for i in range(len(segs)):
            segs[i].feat = X_all[i, :]
    y_all = np.array([x.ID for x in segs])
    X_all = np.array(X_all)
    ######################

    ######################
    # TODO Step 6: Data split
    ######################
    if strategy != 'NV_data':
        X_train, X_test, y_train, y_test = train_test_split(
            X_all, y_all, test_size=0.2, random_state=42)
    else:
        X_train, X_test, y_train, y_test = [], [], [], []
        y_test_mrks = []
        for i in range(len(NV_inds)):
            curr_mrks = mrks_bySegs[IDs_bySegs == i]  # current person's mrks
            # print curr_mrks
            curr_segs = segs[IDs_bySegs == i]
            curr_labels = labels_bySegs[IDs_bySegs == i]
            curr_inds_Vs = np.where(curr_mrks == 'V')[0]
            curr_inds_Ns = np.where(curr_mrks == 'N')[0]
            curr_num_Vs = sum(np.array(curr_mrks) == 'V')  # all his Vs
            curr_num_Ns = sum(np.array(curr_mrks) == 'N')
            if NV_type == 'fixV':
                train_num_Vs = int(curr_num_Vs * .8)
                train_num_Ns = min(
                    [int(curr_num_Ns * .8), int(ratio * train_num_Vs)])
            elif NV_type == 'NVequals':
                train_num_Vs = int(curr_num_Vs * ratio)
                train_num_Ns = train_num_Vs
            # list() so random.sample accepts the numpy index arrays on Python 3
            train_inds_Vs = random.sample(list(curr_inds_Vs), train_num_Vs)
            test_inds_Vs = [
                x for x in curr_inds_Vs if not (x in train_inds_Vs)
            ]
            # test_inds_Vs = curr_inds_Vs[~ train_inds_Vs]
            train_inds_Ns = random.sample(list(curr_inds_Ns), train_num_Ns)
            test_inds_Ns = [
                x for x in curr_inds_Ns if not (x in train_inds_Ns)
            ]
            # print len(train_inds_Vs), len(test_inds_Vs)
            # print len(train_inds_Ns), len(test_inds_Ns)
            # test_inds_Ns = curr_inds_Vs[~ train_inds_Ns]
            # print train_inds_Ns
            # print test_inds_Ns
            curr_IDs = IDs_bySegs[IDs_bySegs == i]
            # print curr_IDs
            for one_seg in curr_segs[train_inds_Vs]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Vs]:
                y_train.append(one_lab)
            for one_seg in curr_segs[train_inds_Ns]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Ns]:
                y_train.append(one_lab)
            for one_seg in curr_segs[test_inds_Vs]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Vs]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Vs]:
                y_test_mrks.append(one_mrk)
            for one_seg in curr_segs[test_inds_Ns]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Ns]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Ns]:
                y_test_mrks.append(one_mrk)
            # print i
            # print len(X_train), len(y_train), len(X_test), len(y_test)
        X_train, y_train, X_test, y_test = \
            np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
    ######################
    # print X_train.shape, y_train.shape, X_test.shape, y_test.shape
    # quit()
    # print X_train
    # print X_test
    # y_train = [int(y[:-1]) for y in y_train]
    # y_test = [int(y[:-1]) for y in y_test]

    ######################
    # TODO Step 7: Model training
    ######################
    time_before_training = Time()
    if method_clsf == 'SVM':
        not_trained = True
        import joblib  # sklearn.externals.joblib was removed; use the standalone package
        if not_trained:
            clf = svm.SVC(kernel='rbf', C=10., gamma=0.1)
            clf.fit(X_train, y_train)
            joblib.dump(clf, 'test_clf.pkl')
        else:
            clf = joblib.load('test_clf.pkl')
        res_pred = clf.predict(X_test)
    elif method_clsf == 'Logit':
        clf = LR(C=10.)
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'kNN':
        clf = KNC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DTC':
        clf = DTC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'boosting':
        clf = XGBC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'GNB':
        clf = GNB()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DL':
        not_trained = True
        import joblib  # kept from the original; unused in this branch
        if not_trained:
            model = Sequential()
            model.add(
                Dense(feat_dim, activation='relu', input_shape=(feat_dim, )))
            # model.add(Dense(input_dim, activation='relu'))
            num_categs = len(set(y_train))
            print(y_train, num_categs)
            Y_train = np_utils.to_categorical(y_train, num_categs)
            Y_test = np_utils.to_categorical(y_test, num_categs)
            model.add(Dense(num_categs, activation='softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            X_train = np.array(X_train)
            Y_train = np.array(Y_train)
            # print X_train.shape
            # print Y_train.shape
            model.fit(X_train, Y_train,
                      validation_split=0.2,
                      batch_size=32,
                      nb_epoch=50,  # named 'epochs' in Keras 2 and later
                      verbose=0)
            # model.save('test_clf_DL.pkl')
        else:
            model = keras.models.load_model('test_clf_DL.pkl')
        # score = model.evaluate(X_test, Y_test, verbose=0)
    time_after_training = Time()
    ######################

    ######################
    # TODO Step 8: Model testing
    ######################
    if method_clsf != 'DL':
        res_pred = clf.predict(X_test)
    else:
        # predict_classes() was removed in newer Keras;
        # np.argmax(model.predict(X_test), axis=1) is the replacement
        res_pred = model.predict_classes(X_test)
    ######################

    ######################
    # TODO Step 9: Result output
    ######################
    train_time = time_after_training - time_before_training
    print_res = False
    if print_res:
        print('')
        print('Parameters:')
        print('strategy:', strategy)
        print('constrain_time:', constrain_time)
        print('ratio:', ratio)
        print('method_clsf:', method_clsf)
        # print('')
        print('Results:')
        print('Used time for training:',
              time_after_training - time_before_training)

    res_look = []
    for i in range(len(res_pred)):
        res_look.append((res_pred[i], y_test[i]))
    # print res_look

    if False:
        res_pred_IDs = np.array([y[:-1] for y in res_pred])
        res_pred_mrks = np.array([y[-1] for y in res_pred])
        only_test_ID = True
        if only_test_ID:
            to_be_predct = res_pred_IDs
            to_be_tested = y_test
        else:
            to_be_predct = res_pred
            to_be_tested = y_test

    ## TODO: adjust accordingly
    if strategy == 'NV_data':
        look_stat = 'V'
        y_test_mrks = np.array(y_test_mrks)
        # print y_test_mrks
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]
        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg, '.3f')),
                   float(format(res_by_categ, '.3f')))
        accuBySeg_V = one_res[0]
        # print len(to_be_predct), one_res
        look_stat = 'N'
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]
        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg, '.3f')),
                   float(format(res_by_categ, '.3f')))
        accuBySeg_N = one_res[0]
        # print len(to_be_predct), one_res
        return [accuBySeg_V, accuBySeg_N, train_time]
    else:
        to_be_predct = res_pred
        to_be_tested = y_test
        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg, '.3f')),
                   float(format(res_by_categ, '.3f')))
        return [one_res[0], train_time]
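# Hypothetical driver suggested by the function name: sweep the training ratio
# for the boosting classifier in 'NV_data' mode and plot accuracy-by-segment
# for V and N beats. The ratio grid and the plotting are assumptions.
import matplotlib.pyplot as plt

ratios = [0.2, 0.4, 0.6, 0.8]  # assumed grid
results = [optimCurveFit('NV_data', 'boosting', ratio=r) for r in ratios]
accu_V = [r[0] for r in results]
accu_N = [r[1] for r in results]
plt.plot(ratios, accu_V, label='V beats')
plt.plot(ratios, accu_N, label='N beats')
plt.xlabel('training ratio')
plt.ylabel('accuracy by segment')
plt.legend()
plt.show()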