Example #1
import pickle
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
# load_data and OHStokenize are project-local helpers the snippet assumes but does not show

def gbdt(train_x, train_y, test_x, uid_cid):
    print('start training...')
    clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=300, max_depth=7,
                                     max_features=0.6, min_samples_leaf=128,
                                     max_leaf_nodes=128, subsample=0.6, verbose=1)
    clf.fit(train_x, train_y)  # fit, not the long-removed fit_transform
    print('finish training...')
    joblib.dump(clf, 'model/gbdt_model.m')
    print('start predicting...')
    predict_y = clf.predict_proba(test_x)
    print(clf.classes_)
    print(clf.feature_importances_)
    print('finish predicting...')
    return predict_y
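# hypothetical usage sketch (not in the original file): a minimal smoke test
# for gbdt on random data, assuming the model/ directory exists for the dump;
# uid_cid is unused by the function, so None is a safe stand-in:
#   import os
#   os.makedirs('model', exist_ok=True)
#   rng = np.random.RandomState(0)
#   probs = gbdt(rng.rand(500, 20), rng.randint(0, 2, size=500),
#                rng.rand(100, 20), uid_cid=None)
#   print(probs[:5])  # per-class probabilities for the first five test rows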
def main():
    print "entering main..."

    path = '../Data/labeledRedditComments2.p'
    ppath = '../Data/cv.p'

    load_tstart = time.time()
    print('loading data...')
    df = load_data(path)
    dfcv = pd.read_pickle(ppath)
    load_tstop = time.time()

    #take a subset of the data for testing this code
    # randNums = np.random.randint(low=0, high=len(df.index), size=200)
    # rowList = [int(row) for row in randNums]
    # dfsmall = df.iloc[rowList, :]

    nf = df

    #create training set and labels
    X = nf.body
    y = nf.label
    Xcv = dfcv['text'].values
    ycv = dfcv['label'].values

    # vect_tstart = time.time()
    # print "creating vectorizer..."
    # vect = TfidfVectorizer(stop_words='english', decode_error='ignore',
    #                        tokenizer=OHStokenize)
    #
    # print "vectorizing..."
    # # fit & transform comments matrix
    # tfidf_X = vect.fit_transform(X)
    #
    # print "pickling vectorizer..."
    # pickle.dump(vect, open('vect2.p', 'wb'))
    print('vectorizing')
    # vect = pickle.load(open('vect2.p', 'rb'))
    # tfidf_X = vect.transform(X)
    # tfidf_Xcv = vect.transform(Xcv)
    # with open('../Data/doc_matrix.pickle','wb') as f:
    #     pickle.dump(tfidf_X,f)
    # vect_tstop = time.time()

    with open('../Data/doc_matrix.pickle', 'rb') as f:
        X = pickle.load(f)
    print('training model')
    bst = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1)
    bst.fit(X, y)  # GradientBoostingClassifier has no fit_transform; fit is the right call
    print('pickling')
    with open('../Data/boost4.model', 'wb') as f:
        pickle.dump(bst, f)
    #balance_trainning_set(X,y)
    y1 = 0
    y0 = 0
    for i in range(len(y)):
        if y[i] == 1:
            y1 += 1
        else:
            y0 += 1

    print "We got X for " + str(len(X)) + " and Y for " + str(len(y))
    print "we have " + str(y1) + "for 1 and " + str(y0) + " for 0"
    clf = GradientBoostingClassifier(n_estimators=47,
                                     learning_rate=0.03,
                                     max_depth=3,
                                     random_state=0)
    clf.fit(X, y)  # the removed fit_transform also returned X reduced to the
    # important features; SelectFromModel(clf, prefit=True) is the modern route
    importances = clf.feature_importances_
    position_proportion = 0.0  # features 0-8
    vertical_proportion = 0.0  # features 9-74
    query_proportion = 0.0     # features 75-77
    text_proportion = 0.0      # features 78 - last
    #print "size of importances " + str(len(importances))

    #indices = np.argsort(importances)[::-1]
    #for f in range(10):
    #	print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    #print len(test_X[0])
    #clf2 = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    test = ['accuracy', 'recall_macro', 'f1_macro', 'roc_auc']
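    # the scorer list above presumably feeds a cross-validation step; a sketch
    # of that call (an assumption, not part of the original file):
    from sklearn.model_selection import cross_validate
    cv_scores = cross_validate(clf, X, y, cv=3, scoring=test)
    for name, vals in cv_scores.items():
        print(name, vals.mean())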
Example #4
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc, roc_curve as skrc  # skrc is evidently an alias for roc_curve
from sklearn.model_selection import train_test_split

# x_train/x_test/y_train/y_test come from an earlier split not shown here
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
gb.score(x_test, y_test)
proba = pd.DataFrame(gb.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
auc(false_positive_rate, true_positive_rate)
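
# aside (not in the original): roc_auc_score collapses the two-step
# roc_curve + auc computation above into a single call
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, proba))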

# find the best features for gradient boosting
# feature selection based on single-feature AUC
X, X_test, y, y_test = train_test_split(X, y, train_size=.9)
model = GradientBoostingClassifier()
features = []
scores = []
for i in X:                        # iterate over the DataFrame's column names
    features.append(i)
    model.fit(X[[i]], y)           # fit, not the removed fit_transform
    proba = model.predict_proba(X_test[[i]])
    proba = pd.DataFrame(proba)[1]
    false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
    scores.append(auc(false_positive_rate, true_positive_rate))
df_f = pd.DataFrame({'features': features, 'scores': scores})
df_f = df_f.sort_values(by='scores', ascending=False)
best = df_f.features
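
# hypothetical next step (an assumption, not the original author's code):
# refit on the top-ranked columns and check whether the combined AUC beats
# the best single feature
top5 = list(best[:5])
model.fit(X[top5], y)
proba = pd.DataFrame(model.predict_proba(X_test[top5]))[1]
fpr, tpr, _ = skrc(y_test, proba)
print(auc(fpr, tpr))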


# find the best AUC
# build new train and test sets
train, test = train_test_split(df, train_size=.9)
y_train = train['2015h']
x_train = train.drop('2015h', axis=1)
y_test = test['2015h']
x_test = test.drop('2015h', axis=1)
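
# the example breaks off at this point; a minimal continuation under the same
# assumptions (refit and score on the rebuilt split) might read:
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
proba = pd.DataFrame(gb.predict_proba(x_test))[1]
fpr, tpr, _ = skrc(y_test, proba)
print(auc(fpr, tpr))  # hold-out AUC on the new split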