def gbdt(train_x, train_y, test_x, uid_cid):
    """Train a gradient-boosted classifier and score test_x.

    Side effect: persists the fitted model to 'model/gbdt_model.m' via joblib.

    Args:
        train_x: training feature matrix.
        train_y: training labels.
        test_x: feature matrix to produce probabilities for.
        uid_cid: unused in this function; kept for caller compatibility.

    Returns:
        Per-class probability array from predict_proba (columns ordered
        as clf.classes_, which is printed for reference).
    """
    # Single-argument print(...) works identically on Python 2 and 3.
    print('start training...')
    clf = GradientBoostingClassifier(
        learning_rate=0.05,
        n_estimators=300,
        max_depth=7,
        max_features=0.6,
        min_samples_leaf=128,
        max_leaf_nodes=128,
        subsample=0.6,
        verbose=1,
    )
    # fit() replaces the removed estimator fit_transform(); the transformed
    # matrix was discarded anyway, so training behavior is unchanged.
    clf.fit(train_x, train_y)
    print('finish training...')
    joblib.dump(clf, 'model/gbdt_model.m')
    print('start predicting...')
    predict_y = clf.predict_proba(test_x)
    print(clf.classes_)
    print(clf.feature_importances_)
    print('finish predicting...')
    return predict_y
def main():
    """Load precomputed TF-IDF features, train a gradient-boosting model, pickle it.

    Reads the labeled comment frame and a CV frame from disk; the TF-IDF
    vectorization step was done in a prior run (the matrix is loaded from
    '../Data/doc_matrix.pickle'), so the raw text X below is superseded.
    """
    print("entering main...")
    path = '../Data/labeledRedditComments2.p'
    ppath = '../Data/cv.p'
    load_tstart = time.time()
    print('loading data...')
    df = load_data(path)
    dfcv = pd.read_pickle(ppath)
    load_tstop = time.time()
    nf = df
    # Training texts/labels plus a held-out CV split.
    X = nf.body
    y = nf.label
    Xcv = dfcv['text'].values
    ycv = dfcv['label'].values
    print('vectorizing')
    # Load the document-term matrix vectorized offline; use a context
    # manager so the file handle is not leaked.
    with open('../Data/doc_matrix.pickle', 'rb') as f:
        X = pickle.load(f)
    print('training model')
    bst = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1)
    # fit() replaces the removed estimator fit_transform(); the return
    # value was unused, so training behavior is unchanged.
    bst.fit(X, y)
    print('pickling')
    with open('../Data/boost4.model', 'wb') as f:
        pickle.dump(bst, f)
# Top-level script block: report the class balance of (X, y) — defined
# earlier in the file — then fit a gradient-boosting model on them.
#balance_trainning_set(X,y)
y1 = sum(1 for label in y if label == 1)  # positive-class count
y0 = len(y) - y1                          # everything non-1 counts as class 0
print("We got X for " + str(len(X)) + " and Y for " + str(len(y)))
# NOTE: fixed the original message, which was missing a space before "for 1".
print("we have " + str(y1) + " for 1 and " + str(y0) + " for 0")
clf = GradientBoostingClassifier(n_estimators=47, learning_rate=0.03,
                                 max_depth=3, random_state=0)
# fit() replaces the removed estimator fit_transform(); its return value
# (test_X) was referenced only from commented-out debug code below.
clf.fit(X, y)
importances = clf.feature_importances_
# Per-feature-group importance accumulators; index ranges per original notes.
position_propotion = 0.0  # features 0-8
vertical_propotion = 0.0  # features 9-74
query_propotion = 0.0     # features 75-77
text_propotion = 0.0      # features 78-last
#print "size of importances " + str(len(importances))
#indices = np.argsort(importances)[::-1]
#for f in range(10):
#    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
#print len(test_X[0])
#clf2 = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
# Candidate scoring metrics for cross-validation.
test = ['accuracy', 'recall_macro', 'f1_macro', 'roc_auc']
# Top-level script block: fit a baseline gradient-boosting model, measure
# its test AUC, then rank every feature by the AUC of a one-feature model.
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
gb.score(x_test, y_test)  # accuracy; value displayed in interactive use
# Column 1 of predict_proba = probability of the positive class.
proba = pd.DataFrame(gb.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
auc(false_positive_rate, true_positive_rate)

#find best features for Gradient Boost
#Feature selection based on AUC
X, X_test, y, y_test = train_test_split(X, y, train_size=.9)
model = GradientBoostingClassifier()
features = []
scores = []
for i in X:
    features.append(i)
    # fit() replaces the removed estimator fit_transform(); the transformed
    # output was discarded, so training behavior is unchanged.
    model.fit(X[[i]], y)
    proba = model.predict_proba(X_test[[i]])
    proba = pd.DataFrame(proba)[1]
    false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
    scores.append(auc(false_positive_rate, true_positive_rate))
# Rank features best-first by single-feature AUC.
df_f = pd.DataFrame({'features': features, 'scores': scores})
df_f = df_f.sort_values(by='scores', ascending=False)
best = df_f.features

#Find best AUC
#build new train and test sets with '2015h' as the target column
train, test = train_test_split(df, train_size=.9)
y_train = train['2015h']
x_train = train.drop('2015h', axis=1)
y_test = test['2015h']
# Top-level script block: build the feature matrix from aspects_1 (capped at
# 1000 rows), clean it, report the class balance, then fit a model.
X = write2X(aspects_1)[:1000]
clear_trainning_set(X, y)
#clear_trainning_set(X2,y2)
#balance_trainning_set(X,y)
y1 = sum(1 for label in y if label == 1)  # positive-class count
y0 = len(y) - y1                          # everything non-1 counts as class 0
print("We got X for " + str(len(X)) + " and Y for " + str(len(y)))
# NOTE: fixed the original message, which was missing a space before "for 1".
print("we have " + str(y1) + " for 1 and " + str(y0) + " for 0")
clf = GradientBoostingClassifier(n_estimators=47, learning_rate=0.03,
                                 max_depth=3, random_state=0)
# fit() replaces the removed estimator fit_transform(); its return value
# (test_X) was referenced only from commented-out debug code below.
clf.fit(X, y)
importances = clf.feature_importances_
# Per-feature-group importance accumulators; index ranges per original notes.
position_propotion = 0.0  # features 0-8
vertical_propotion = 0.0  # features 9-74
query_propotion = 0.0     # features 75-77
text_propotion = 0.0      # features 78-last
#print "size of importances " + str(len(importances))
#indices = np.argsort(importances)[::-1]
#for f in range(10):
#    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
#print len(test_X[0])
#clf2 = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
# Candidate scoring metrics for cross-validation.
test = ['accuracy', 'recall_macro', 'f1_macro', 'roc_auc']