def kNN(train_data, train_labels, test):
    """Train a k-nearest-neighbours classifier (k=5) and predict labels for *test*.

    Parameters
    ----------
    train_data : training feature matrix
    train_labels : target labels aligned with ``train_data``
    test : feature matrix to classify

    Returns
    -------
    The predicted labels. They are also pickled to
    ``./data/predict_labels/predict_labels.p`` as a side effect.
    """
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')
    # Return the predictions for consistency with gNB(); the original bare
    # `return` forced callers to re-load the pickle to get the result.
    return predict_labels
def logit(train_data, train_labels, test):
    """Train a logistic-regression (MaxEnt) classifier and predict labels for *test*.

    Parameters
    ----------
    train_data : training feature matrix
    train_labels : target labels aligned with ``train_data``
    test : feature matrix to classify

    Returns
    -------
    The predicted labels. They are also pickled to
    ``./data/predict_labels/predict_labels.p`` as a side effect.
    """
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')
    # Return the predictions for consistency with gNB(); the original bare
    # `return` forced callers to re-load the pickle to get the result.
    return predict_labels
def svm_classify(train_data, train_labels, test):
    """Train a linear-kernel SVM (C=5.0) and predict labels for *test*.

    Parameters
    ----------
    train_data : training feature matrix
    train_labels : target labels aligned with ``train_data``
    test : feature matrix to classify

    Returns
    -------
    The predicted labels. They are also pickled to
    ``./data/predict_labels/predict_labels.p`` as a side effect.
    """
    log_state('Use SVM classifier')
    clf = svm.SVC(C=5.0, kernel='linear')
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
    # Return the predictions for consistency with gNB(); the original bare
    # `return` forced callers to re-load the pickle to get the result.
    return predict_labels
def kNN(train_data, train_labels, test):
    """Train a k-nearest-neighbours classifier (k=5) and predict labels for *test*.

    The predictions are pickled to ``./data/predict_labels/predict_labels.p``
    and also returned, matching the contract of ``gNB``.
    """
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'kNN classifier training complete, saved predict labels to pickle')
    # Previously a bare `return`: expose the result instead of discarding it.
    return predict_labels
def logit(train_data, train_labels, test):
    """Train a logistic-regression (MaxEnt) classifier and predict labels for *test*.

    The predictions are pickled to ``./data/predict_labels/predict_labels.p``
    and also returned, matching the contract of ``gNB``.
    """
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'MaxEnt classifier training complete, saved predict labels to pickle')
    # Previously a bare `return`: expose the result instead of discarding it.
    return predict_labels
def svm_classify(train_data, train_labels, test):
    """Train a linear-kernel SVM (C=5.0) and predict labels for *test*.

    The predictions are pickled to ``./data/predict_labels/predict_labels.p``
    and also returned, matching the contract of ``gNB``.
    """
    log_state('Use SVM classifier')
    clf = svm.SVC(C=5.0, kernel='linear')
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'SVM classifier training complete, saved predict labels to pickle')
    # Previously a bare `return`: expose the result instead of discarding it.
    return predict_labels
def gNB(train_data, train_labels, test, save_result=False):
    """Train a Gaussian Naive Bayes classifier and predict labels for *test*.

    Parameters
    ----------
    train_data : training feature matrix
    train_labels : target labels aligned with ``train_data``
    test : feature matrix to classify
    save_result : bool, optional
        When true, pickle both the hard labels and the class probabilities
        under ``./data/predict_labels/``. Defaults to False.

    Returns
    -------
    The predicted labels.
    """
    log_state('Use Gaussian Naive Bayes classifier')
    clf = GaussianNB()
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    predict_proba = clf.predict_proba(test)
    # Idiomatic truth test instead of the original `save_result == True`.
    if save_result:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
    logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
def TFIDF_estimator():
    """Build a TF-IDF vectorizer with NLTK Snowball stemming.

    All vectorizer settings are read from the module-level ``parameters``
    dict; the chosen configuration and training-data size are logged.

    Returns
    -------
    An unfitted ``StemmedTfidfVectorizer`` instance.
    """
    log_state('Start generating features')

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            # Stem every token with NLTK's English Snowball stemmer.
            english_stemmer = nltk.stem.SnowballStemmer('english')
            # Bug fix: super() must receive the subclass, not the parent.
            # The original `super(TfidfVectorizer, self)` skipped
            # TfidfVectorizer in the MRO and only worked by accident because
            # build_analyzer is inherited from CountVectorizer.
            analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
            return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

    vectorizer_param = {
        'preprocessor': preprocessor,
        'ngram_range': parameters['ngram_range'],
        'analyzer': 'word',
        'min_df': parameters['min_df'],
        'max_df': parameters['max_df'],
        'binary': parameters['TF_binary'],
        'norm': parameters['norm'],
        'sublinear_tf': parameters['sublinear_tf'],
        'max_features': parameters['max_features'],
    }
    # Sort for a deterministic, readable log line (redundant list()/parens removed).
    log_state(sorted(vectorizer_param.items()))
    # NOTE(review): message says "Training data size" but reads
    # parameters['test_data_size'] — confirm which is intended.
    log_state('Training data size: ' + str(parameters['test_data_size']))
    return StemmedTfidfVectorizer(**vectorizer_param)
def TFIDF_estimator():
    """Build a TF-IDF vectorizer with NLTK Snowball stemming.

    Vectorizer settings come from the module-level ``parameters`` dict;
    the configuration and the training-data size are logged.

    Returns
    -------
    An unfitted ``StemmedTfidfVectorizer`` instance.
    """
    log_state('Start generating features')

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            # Stem every token with NLTK's English Snowball stemmer.
            english_stemmer = nltk.stem.SnowballStemmer('english')
            # Bug fix: super() must receive the subclass, not the parent.
            # The original `super(TfidfVectorizer, self)` skipped
            # TfidfVectorizer in the MRO and only worked by accident because
            # build_analyzer is inherited from CountVectorizer.
            analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
            return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

    vectorizer_param = {
        'preprocessor': preprocessor,
        'ngram_range': parameters['ngram_range'],
        'analyzer': 'word',
        'min_df': parameters['min_df'],
        'max_df': parameters['max_df'],
        'binary': parameters['TF_binary'],
        'norm': parameters['norm'],
        'sublinear_tf': parameters['sublinear_tf'],
        'max_features': parameters['max_features'],
    }
    # Sort for a deterministic, readable log line (redundant list()/parens removed).
    log_state(sorted(vectorizer_param.items()))
    # NOTE(review): message says "Training data size" but reads
    # parameters['test_data_size'] — confirm which is intended.
    log_state('Training data size: ' + str(parameters['test_data_size']))
    return StemmedTfidfVectorizer(**vectorizer_param)
from vectorizers import punctuation_estimator
from logger_manager import log_state
from anew_vectorizer import strength_vectorizer, avg_affective_vectorizer

# TF-IDF vectorizer configuration. `preprocessor`, `parameters`,
# `StemmedTfidfVectorizer`, `FeatureUnion`, `load_train_data`,
# `load_test_data` and `dump_picle` are defined/imported elsewhere in this
# module — not visible in this chunk.
vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'],
                    'analyzer': 'word', 'min_df': parameters['min_df'],
                    'max_df': parameters['max_df'], 'binary': parameters['TF_binary'],
                    'norm': parameters['norm'], 'sublinear_tf': parameters['sublinear_tf'],
                    'max_features': parameters['max_features']}

if __name__ == "__main__":
    # Instantiate all candidate feature extractors; only one combination is
    # active at a time (the others are kept commented out for experiments).
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    # Active combination: unigram TF-IDF + average affective-strength features.
    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # Alternative feature combinations (switch by uncommenting one pair):
    # log_state('combine unigram and strength features')
    # combined_features =FeatureUnion([('unigram',unigram),('strength',strength)])
    # log_state('combine unigram and anew features')
    # combined_features =FeatureUnion([('unigram',unigram),('anew',anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features =FeatureUnion([('unigram',unigram),('pct',pct)])

    # Fit on the Sentiment140 training texts, then transform the test set
    # with the same (already fitted) feature union.
    texts, _ = load_train_data('Sentiment140')
    transformed_train = combined_features.fit_transform(texts)
    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)
    # Persist the feature names for later inspection/debugging.
    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')