def naive_bayse_cross(train_x, train_y, validation, test, test_data):
    """Cross-validate a bag-of-n-grams Multinomial Naive Bayes pipeline.

    The pipeline chains a unigram+bigram ``CountVectorizer``, a
    ``RandomUnderSampler`` and ``MultinomialNB(alpha=0.01)``, and is scored
    with 10-fold cross-validation on every call.

    Args:
        train_x: Training samples handed to the pipeline.
        train_y: Labels matching ``train_x``.
        validation: When truthy, print the fold scores and their spread,
            then a classification report and confusion matrix computed
            from 5-fold cross-validated predictions, and plot the matrix.
        test: When truthy, call ``naive_bayes(test_data)``.
        test_data: Data forwarded to ``naive_bayes``.
    """
    print("training data...")
    steps = (
        CountVectorizer(ngram_range=(1, 2)),
        RandomUnderSampler(),
        MultinomialNB(alpha=0.01),
    )
    pipeline = make_pipeline(*steps)
    fold_scores = cross_val_score(pipeline, train_x, train_y, cv=10)
    print("Model is fitted!")

    if validation:
        print("scores: ", fold_scores)
        print("std of score: ", np.std(fold_scores))
        mean_score = fold_scores.mean()
        spread = fold_scores.std() * 2
        print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))

        # Out-of-fold predictions feed both the report and the matrix.
        predictions = cross_val_predict(pipeline, train_x, train_y, cv=5)
        report = classification_report(train_y, predictions)
        print("classification reports:", report)

        matrix = confusion_matrix(train_y, predictions)
        print(matrix)
        plot_conf(matrix)

    if test:
        # NOTE(review): other callers pass a second ``reference`` argument
        # to naive_bayes -- confirm the one-argument call is intended.
        naive_bayes(test_data)
def test_custom_review(count_vec, train_vec, y_train_data):
    """Classify a single review typed in by the user.

    Reads one line from standard input, cleans it with ``clean_review``,
    vectorizes it with ``count_vec`` and prints whether ``naive_bayes``
    predicts it as positive or negative.

    Args:
        count_vec: Vectorizer whose ``transform`` is applied to the cleaned
            review.
        train_vec: Training features passed through to ``naive_bayes``.
        y_train_data: Training labels passed through to ``naive_bayes``.
    """
    print('\nTest a custom review message')
    print('Enter review to be analysed: ', end=" ")
    typed_reviews = [input()]
    test_review = pd.DataFrame(data={"id": 1, "review": typed_reviews})

    print("Cleaning the test review")
    cleaned_reviews = [clean_review(text) for text in test_review.review]

    print("Vectorizing the test review")
    review_features = count_vec.transform(cleaned_reviews)

    print("Predicting")
    prediction = naive_bayes(train_vec, review_features, y_train_data)
    # A prediction equal to 1 is reported as positive; anything else as negative.
    verdict = (
        "The review is predicted positive"
        if prediction == 1
        else "The review is predicted negative"
    )
    print(verdict)
def __init__(self, config):
    """Build the wrapped model described by ``config``.

    This class is a wrapper for a model which targets either the Purpose
    or Field category set.

    Args:
        config: Mapping with the keys ``'target_set'`` (``'purpose'`` or
            ``'field'``), ``'model_config'`` (keyword arguments forwarded
            to the model's ``set_params``), ``'model'`` (one of
            ``'LogisticRegression'``, ``'DecisionTree'``, ``'SVC'``,
            ``'Naivebayes'`` or ``'RandomForest'``) and, optionally,
            ``'labels'`` (a non-default label set).

    Raises:
        ValueError: If ``config['target_set']`` is neither ``'purpose'``
            nor ``'field'``.
        SystemExit: If ``config['model']`` is not a known model name.
    """
    self.target_set = config['target_set']
    self.model_config = config['model_config']
    self.model_name = config['model']

    # Use the configured label set when present, else the default one for
    # this target set.
    if 'labels' in config:
        labels = config['labels']
    else:
        labels = LabelTransformer.default_labels(self.target_set)
    self.labelset = LabelTransformer(self.target_set, labels)

    if self.target_set not in ('purpose', 'field'):
        raise ValueError('Unknown target_set configuration value: %s \n'
                         % config['target_set'])

    # Sort out which model the configuration contains.
    if self.model_name == 'LogisticRegression':
        self.model = LogisticRegression()
    elif self.model_name == 'DecisionTree':
        self.model = DecisionTreeClassifier()
    elif self.model_name == 'SVC':
        self.model = SVC()
    elif self.model_name == 'Naivebayes':
        # NOTE(review): confirm ``naive_bayes`` is a callable returning an
        # estimator with ``set_params`` (not a module).
        self.model = naive_bayes()
    elif self.model_name == 'RandomForest':
        self.model = RandomForestClassifier()
    else:
        # Report the requested name; the model itself was never created,
        # so formatting it would always print "None".
        sys.exit('Invalid config, model given is unknown: %s'
                 % self.model_name)

    self.model.set_params(**self.model_config)
def _run_holdout_stages(train_fn, evaluate_fn, predict_fn, splits, test_data,
                        reference, train, validation, test):
    """Run the enabled train / validate / test stages for one classifier."""
    train_data, validate_data, train_target, validate_target = splits
    if train:
        print("training data...")
        train_fn(train_data, train_target)
    if validation:
        print("evaluating data...")
        evaluate_fn(validate_data, validate_target)
    if test:
        print("testing data...")
        predict_fn(test_data, reference)
        # Backslashes are escaped explicitly; "\R" and "\P" are not valid
        # escape sequences.
        print("results are written in: \\Results\\Prediction.xlsx")


def train_model(classifier, train_path, test_path, type_classification, train=True, validation=True, test=True, cross_validation=False):
    """Load the data sets and run the selected classifier.

    Args:
        classifier: ``"NB"``, ``"SVM"`` or ``"LR"``. ``"LR"`` is only
            handled without cross-validation; any other value runs no
            classifier.
        train_path: Location passed to the training-set reader.
        test_path: Location passed to ``collect_test_documents``.
        type_classification: Which reader builds the training set:
            ``"T"`` (``collect_titles``), ``"TB"`` (``collect_documents``)
            or ``"TBW"`` (``collect_weighted_doc``).
        train: Run the training stage (hold-out mode only).
        validation: Run the evaluation stage.
        test: Run the prediction stage on the test data.
        cross_validation: When true, delegate to the cross-validation
            routines instead of using a 60/40 hold-out split.

    Raises:
        ValueError: If ``type_classification`` is not one of the supported
            codes. Previously this only printed a message and failed later
            with an unbound ``train_x``.
    """
    print("reading train set...")
    if type_classification == "T":
        # Titles and their labels.
        train_x, train_y = collect_titles(train_path)
    elif type_classification == "TB":
        # Whole documents.
        train_x, train_y = collect_documents(train_path)
    elif type_classification == "TBW":
        # Weighted title and body.
        train_x, train_y = collect_weighted_doc(train_path)
    else:
        raise ValueError(
            "wrong argument: type_classification must be 'T', 'TB' or "
            "'TBW', got %r" % (type_classification,))

    # The test set is loaded regardless of ``test`` because it is handed to
    # the cross-validation routines unconditionally.
    print("loading test data...")
    test_data, reference = collect_test_documents(test_path)

    if cross_validation:
        if classifier == "NB":
            naive_bayse_cross(train_x, train_y, validation, test, test_data)
        elif classifier == "SVM":
            SVM_train_cross(train_x, train_y, validation, test, test_data)
        return

    print("spliting the train set...")
    splits = train_test_split(train_x, train_y, test_size=0.4, random_state=0)
    stage_args = (splits, test_data, reference, train, validation, test)

    if classifier == "NB":
        _run_holdout_stages(naive_bayes_train, naive_bayes_evaluate,
                            naive_bayes, *stage_args)
    elif classifier == "SVM":
        _run_holdout_stages(svm_train, svm_evaluate, svm, *stage_args)
    elif classifier == "LR":
        _run_holdout_stages(train_logistic_regression,
                            validate_logistic_regression,
                            logistic_regression, *stage_args)
print "SVM, N-Gram Vectors: ", accuracy def random_forrest(): #RF on Count Vectors accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count) print "RF, Count Vectors: ", accuracy # RF on Word Level TF IDF Vectors accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf) print "RF, WordLevel TF-IDF: ", accuracy while choice != "q": if choice == "1": naive_bayes() elif choice == "2": linear_classifier() elif choice == "3": svm() elif choice == "4": random_forrest() else: print("Invalid choice, please choose again") print("\n") choice = getChoice()
delimiter="\t", quoting=0) y_train_data = train_data.sentiment #Vectorization - TFIDF print("Using TFIDF ") train_vect, test_vec, count_vec = tfidf_vectorizer(train_list, test_list, train_data, test_data) #Dimensionality Reduction train_vec, test_vec = dimensionality_reduction(train_vect, test_vec, y_train_data) #Prediction pred_naive_bayes = naive_bayes(train_vec, test_vec, y_train_data) pred_random_forest = random_forest(train_vec, test_vec, y_train_data) pred_linear_svc = linear_svc(train_vec, test_vec, y_train_data) pred_logistic = logistic_regression(train_vec, test_vec, y_train_data) #Writing output of classifier with highest accuracy(Linear SVC)to csv output = pd.DataFrame( data={ "id": test_data.id, "review": test_data.review, "sentiment": pred_linear_svc }) output.to_csv("tfidf_svc.csv", index=False) print("Using pre-trained word2vec model") train_list = []