def Xgboost():
    """Train an XGBoost classifier on the cnews training split and print test accuracy.

    Pipeline: bag-of-words counts (terms in >20% of docs dropped) -> tf-idf -> XGBoost.
    The fitted pipeline is persisted to ./model/XGB.m via joblib.
    """
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    pipeline = Pipeline([
        ("vectors", CountVectorizer(max_df=0.2)),
        ("tfidf", TfidfTransformer(use_idf=True)),
        ("bst", XGBClassifier(n_jobs=10, max_depth=55, objective='multi:softmax',
                              num_class=10, subsample=0.4, reg_lambda=0.8)),
    ])
    pipeline.fit(train_data, train_label)
    joblib.dump(pipeline, "./model/XGB.m")

    # Evaluate on the held-out test split.
    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    predicted = pipeline.predict(test_data)
    print('Xgboost', np.mean(predicted == test_label))
def svc():
    """Train a linear SVM on the cnews training split and print test accuracy.

    Pipeline: bag-of-words counts (max_df=0.2) -> raw tf weighting (idf disabled)
    -> linear-kernel SVC. The fitted pipeline is persisted to ./SVM.m.
    """
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    test_data, test_label = genKeyWords("../data/cnews.test.txt")

    pipeline = Pipeline([
        ("count", CountVectorizer(max_df=0.2, max_features=None)),
        ("tfidf", TfidfTransformer(use_idf=False)),
        ("svc", SVC(C=0.99, kernel='linear')),
    ])
    pipeline = pipeline.fit(train_data, train_label)
    joblib.dump(pipeline, "./SVM.m")

    predicted = pipeline.predict(test_data)
    print('SVC', np.mean(predicted == test_label))
def _densify(X):
    """Convert a scipy sparse matrix to a dense ndarray.

    Module-level (not a lambda) so the fitted pipeline stays picklable for joblib.dump.
    """
    return X.toarray()


def Bayes(mode='mul'):
    """Train a naive-Bayes classifier on the cnews training split and print test accuracy.

    Parameters
    ----------
    mode : str
        'mul' -> MultinomialNB, 'gau' -> GaussianNB, 'bern' -> BernoulliNB.

    Raises
    ------
    ValueError
        If `mode` is not one of the three supported values.

    The fitted pipeline is persisted to ./<mode>_bayes.m via joblib.
    """
    if mode == 'mul':
        model = MultinomialNB()
    elif mode == 'gau':
        model = GaussianNB()
    elif mode == 'bern':
        model = BernoulliNB()
    else:
        raise ValueError('没有该模式,请填写以下mode,\n mul==>MultinomialNB \ngau==>GaussianNB \n bern==>BernoulliNB')

    train_data, train_label = genKeyWords("../data/cnews.train.txt")

    steps = [("vectors", CountVectorizer()), ("tfidf", TfidfTransformer())]
    if mode == 'gau':
        # BUG FIX: GaussianNB.fit rejects the scipy sparse matrix that
        # TfidfTransformer emits (TypeError at fit time), so the 'gau' path
        # previously crashed. Densify the features just for this estimator.
        from sklearn.preprocessing import FunctionTransformer
        steps.append(("densify", FunctionTransformer(_densify, accept_sparse=True)))
    steps.append(("bayes", model))

    pipline = Pipeline(steps)
    pipline.fit(train_data, train_label)

    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    predicted = pipline.predict(test_data)
    joblib.dump(pipline, "./%s_bayes.m" % mode)
    print('naive_bayes', np.mean(predicted == test_label))
def Knn():
    """Train a 5-nearest-neighbours classifier on the cnews training split and print test accuracy.

    Pipeline: bag-of-words counts -> tf-idf -> KNN (all CPU cores).
    The fitted pipeline is persisted to ./knn.m via joblib.
    """
    train_data, train_label = genKeyWords("../data/cnews.train.txt")

    pipeline = Pipeline([
        ("vectors", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
    ])
    pipeline.fit(train_data, train_label)

    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    predicted = pipeline.predict(test_data)
    joblib.dump(pipeline, "./knn.m")
    print('KNeighborsClassifier', np.mean(predicted == test_label))
def DTrees():
    """Grid-search the depth of a decision-tree text classifier and print the best depth.

    Pipeline: bag-of-words counts (max_df=0.6) -> raw tf weighting -> entropy tree.
    Only the winning grid parameters are printed; nothing is saved or evaluated
    on the test split.
    """
    train_data, train_label = genKeyWords("../data/cnews.train.txt")

    pipeline = Pipeline([
        ("vectors", CountVectorizer(max_df=0.6)),
        ("tfidf", TfidfTransformer(use_idf=False)),
        ("tree", DecisionTreeClassifier(criterion="entropy", max_depth=20)),
    ])

    # NOTE(review): the grid below (75..100) always overrides the initial
    # max_depth=20, so that constructor value is effectively a placeholder.
    param_grid = {"tree__max_depth": list(range(75, 105, 5))}
    search = GridSearchCV(pipeline, param_grid, n_jobs=10,
                          scoring=make_scorer(accuracy_score))
    search.fit(train_data, train_label)

    best_parameters = search.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
def K_Means(minibatch):
    """Cluster the cnews training documents into 10 groups with (MiniBatch)KMeans.

    Parameters
    ----------
    minibatch : bool
        True -> MiniBatchKMeans (batch_size=1000), False -> full-batch KMeans.

    Unsupervised: the labels returned by genKeyWords are loaded but unused,
    and the fitted pipeline is neither saved nor evaluated here.
    """
    train_data, train_label = genKeyWords("../data/cnews.train.txt")

    if minibatch:
        clusterer = MiniBatchKMeans(n_clusters=10, init='k-means++', n_init=1,
                                    init_size=1000, batch_size=1000, verbose=False)
    else:
        clusterer = KMeans(n_clusters=10, init='k-means++', max_iter=300,
                           n_init=1, verbose=False)

    pipeline = Pipeline([
        ("vectors", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("kmeans", clusterer),
    ])
    pipeline.fit(train_data)