def get_classifier(vocabulary):
    """Train the classifier that assigns an abstracted sentence to a template.

    Every file in ``./Qdata/question/`` whose name starts with ``【`` is one
    template: the integer between ``【`` and ``】`` in the file name is the
    label and each (stripped) line of the file is one training sentence.
    Sentences are encoded as binary vectors with one column per vocabulary
    entry; a column is 1 when that entry occurs as a substring of the
    sentence.

    Args:
        vocabulary: Sized, ordered collection of strings used as features.

    Returns:
        A ``ComplementNB`` instance fitted on the encoded sentences.
    """
    question_dir = "./Qdata/question/"

    # Collect the training sentences and their template labels.
    sentences = []
    labels = []
    for name in os.listdir(question_dir):
        if name[0] != "【":
            continue
        template_id = int(name[name.index("【") + 1:name.index("】")])
        with open(question_dir + name, "r", encoding="utf-8") as handle:
            lines = [raw.strip() for raw in handle.readlines()]
        sentences.extend(lines)
        labels.extend([template_id] * len(lines))

    # Binary bag-of-vocabulary encoding (substring match, not tokenised).
    features = np.zeros((len(sentences), len(vocabulary)))
    for row, sentence in enumerate(sentences):
        for col, term in enumerate(vocabulary):
            if term in sentence:
                features[row, col] = 1

    model = ComplementNB()
    model.fit(features, labels)
    return model
def realizar_treinamento(registros_de_treino, vetorizador):
    """Fit a Complement Naive Bayes model on (comment, answer) records.

    Args:
        registros_de_treino: Re-iterable collection of records where index 0
            is the comment text and index 1 is the expected answer/label.
        vetorizador: Vectorizer exposing ``fit_transform``; it is fitted here
            as a side effect, so the caller can reuse it to transform new
            comments afterwards.

    Returns:
        The fitted ``ComplementNB`` model.
    """
    textos = [registro[0] for registro in registros_de_treino]
    rotulos = [registro[1] for registro in registros_de_treino]

    matriz_de_treino = vetorizador.fit_transform(textos)

    # BernoulliNB and MultinomialNB were tried as alternatives here;
    # ComplementNB is the model currently in use.
    modelo = ComplementNB()
    modelo.fit(matriz_de_treino, rotulos)
    return modelo
class CNBTwoStepClassifier(ImbalancedTrainerInterface):
    """Two-stage Complement Naive Bayes classifier.

    The first classifier (``clf_yn``) is trained on all samples, with the
    labels of one partition replaced by the placeholder ``'not'``. The second
    classifier (``clf_n``) is trained only on that partition and is used to
    resolve every ``'not'`` prediction into a concrete label.

    The partition comes from the inherited ``_find_dominant_class`` and
    ``_partition`` helpers (presumably "dominant class" vs. "everything
    else" -- confirm against ``ImbalancedTrainerInterface``).
    """

    def __init__(self, alpha=1):
        """Create both stage classifiers with the same smoothing ``alpha``."""
        self.alpha = alpha
        self.clf_yn = ComplementNB(alpha=alpha)
        self.clf_n = ComplementNB(alpha=alpha)

    def fit(self, X_train, y_train):
        """Train both stages.

        Args:
            X_train: Sparse feature matrix (must support ``toarray()``).
            y_train: Labels aligned with the rows of ``X_train``.
        """
        X_train = X_train.toarray().tolist()
        y_train = pd.Series(y_train)
        max_class = self._find_dominant_class(X_train, y_train)
        x_w, x_o, y_w, y_o = self._partition(X_train, y_train, max_class[0])
        # Stage one sees every sample; the second partition is collapsed
        # into the single placeholder label 'not'.
        x_yn = x_w + x_o
        y_yn = y_w + ['not'] * len(y_o)
        self.clf_yn.fit(x_yn, y_yn)
        self.clf_n.fit(x_o, y_o)

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Rows that stage one labels ``'not'`` are sent to stage two in a
        single batched call. The previous row-by-row call handed 1-D rows to
        ``predict`` for dense input, which scikit-learn rejects.

        Args:
            X_test: 2-D feature matrix supporting row selection with a list
                of integer indices (NumPy array or SciPy CSR matrix).

        Returns:
            list: One predicted label per row, in row order.
        """
        first_stage = self.clf_yn.predict(X_test)
        deferred_rows = [i for i, label in enumerate(first_stage)
                         if label == 'not']
        if not deferred_rows:
            return list(first_stage)
        second_stage = iter(self.clf_n.predict(X_test[deferred_rows]))
        # Walk stage-one output in order, substituting the next stage-two
        # prediction wherever stage one deferred.
        return [next(second_stage) if label == 'not' else label
                for label in first_stage]

    def score(self, X_test, y_test):
        """Return the mean accuracy of ``predict(X_test)`` against ``y_test``."""
        y_test = pd.Series(y_test)
        y_pred = self.predict(X_test)
        acc = np.mean(y_pred == y_test)
        return acc
def confusion_matrix():
    '''
    Build and display a full, row-normalised confusion matrix for the top 15
    varieties.

    Changes to the vectorizer or the model currently have to be made by
    editing this function.

    NOTE(review): ``plot_confusion_matrix`` was removed from recent
    scikit-learn releases in favour of
    ``ConfusionMatrixDisplay.from_estimator`` -- confirm the pinned version.
    '''
    handler = Data_Handler('data/cleaned_data.csv')
    top_varieties = handler.get_top_num(15)
    stop_words = handler.stop_words

    descriptions = top_varieties['description']
    varieties = top_varieties['variety']
    # No random_state is given, so the split differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(descriptions,
                                                        varieties)

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    classifier = ComplementNB()
    classifier.fit(X_train, y_train)

    # Axis order for the plot: reds first, then rosé, then whites.
    display_order = [
        'Pinot Noir',
        'Cabernet Sauvignon',
        'Red Blend',
        'Bordeaux-style Red Blend',
        'Syrah',
        'Merlot',
        'Zinfandel',
        'Sangiovese',
        'Malbec',
        'Nebbiolo',
        'Rosé',
        'Chardonnay',
        'Sauvignon Blanc',
        'Riesling',
        'White Blend',
    ]
    plot_confusion_matrix(classifier,
                          X_test,
                          y_test,
                          normalize='true',
                          xticks_rotation='vertical',
                          labels=display_order,
                          include_values=False)
    plt.show()
def get_optimal_values_ComplementNB(x_train, y_train, x_val, y_val):
    """Grid-search ``alpha``, ``fit_prior`` and ``norm`` for ComplementNB.

    Every combination of ``alpha`` in 0.0, 0.1, ..., 1.0 with
    ``fit_prior`` and ``norm`` in (True, False) is trained on the training
    split and scored by accuracy (in percent) on the validation split. The
    first combination reaching the highest score wins. The winner is also
    printed.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        tuple: ``(max_score, optimal_alpha, optimal_fit_prior, optimal_norm)``.
        If no combination scores above 0 the defaults
        ``(0, 1.0, True, False)`` are returned.
    """
    # NOTE(review): the grid includes alpha=0.0 (no smoothing); how
    # scikit-learn treats it depends on the installed version -- confirm.
    alphas = [x / 10 for x in range(0, 11)]
    fit_priors = [True, False]
    norms = [True, False]

    max_score = 0
    optimal_alpha = 1.0
    optimal_fit_prior = True
    optimal_norm = False

    # Evaluate every combination and keep the best-scoring one.
    for alpha in alphas:
        for fit_prior in fit_priors:
            for norm in norms:
                naive = ComplementNB(alpha=alpha,
                                     fit_prior=fit_prior,
                                     norm=norm)
                naive.fit(x_train, y_train)
                # Score once; it was previously computed twice per candidate.
                score = accuracy_score(y_val, naive.predict(x_val)) * 100
                if max_score < score:
                    max_score = score
                    optimal_alpha = alpha
                    optimal_fit_prior = fit_prior
                    optimal_norm = norm

    print(max_score, optimal_alpha, optimal_fit_prior, optimal_norm)
    return max_score, optimal_alpha, optimal_fit_prior, optimal_norm
def test_cnb():
    """Check ComplementNB (alpha=1.0) on the IR-book toy corpus.

    The example comes from Manning, Raghavan and Schuetze, "Introduction to
    Information Retrieval":
    http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    Training documents:
        Chinese Beijing Chinese   (class: China)
        Chinese Chinese Shanghai  (class: China)
        Chinese Macao             (class: China)
        Tokyo Japan Chinese       (class: Japan)

    Feature columns: Beijing, Chinese, Japan, Macao, Shanghai, Tokyo.
    """
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])
    # Classes are China (0) and Japan (1).
    Y = np.array([0, 0, 0, 1])

    clf = ComplementNB(alpha=1.0)
    # Negative feature values must be rejected.
    assert_raises(ValueError, clf.fit, -X, Y)

    clf.fit(X, Y)

    # Per-class and overall counts.
    expected_feature_count = np.array([[1, 3, 0, 1, 1, 0],
                                       [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, expected_feature_count)
    expected_class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, expected_class_count)
    expected_feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, expected_feature_all)

    # Expected weights; see steps 4-6 in Table 4 of Rennie et al. (2003).
    # Each row holds the smoothed complement-class feature probabilities.
    theta = np.array([
        [
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
        ],
        [
            (1 + 1) / (6 + 6),
            (3 + 1) / (6 + 6),
            (0 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (0 + 1) / (6 + 6),
        ],
    ])
    expected_weights = np.zeros(theta.shape)
    for row in range(2):
        log_theta = np.log(theta[row])
        expected_weights[row] = log_theta / log_theta.sum()

    assert_array_equal(clf.feature_log_prob_, expected_weights)
def _complementnb(*, train, test, x_predict=None, metrics, alpha=1.0, fit_prior=True, class_prior=None, norm=False): """For for info visit : https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB """ model = ComplementNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior, norm=norm) model.fit(train[0], train[1]) model_name = 'ComplementNB' y_hat = model.predict(test[0]) if metrics == 'f1_score': accuracy = f1_score(test[1], y_hat) if metrics == 'jaccard_score': accuracy = jaccard_score(test[1], y_hat) if metrics == 'accuracy_score': accuracy = accuracy_score(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
def findBestFitCluster(orphanCorpus, corpusCluster=None):
    """Score how well each un-clustered question fits the existing clusters.

    A ComplementNB model is fitted on the vectors of the already-clustered
    questions and then used to compute class probabilities for every orphan.

    Example inputs::

        corpusCluster = {
            "questions": ['and the moon too guys',
                          'lets show some or a lot of love for the moon!!'],
            "question_vectors": [[], []],
            "clusterIds": ['4', '4'],
        }
        orphanCorpus = [
            {"id": 11, "question": 'Another one about the sun?',
             "question_vector": []},
            {"id": 33, "question": 'What is the distance from the sun though?',
             "question_vector": []},
        ]

    Parameters:
        orphanCorpus: Iterable of mappings, each with a
            ``"question_vector"`` entry, for the questions without a cluster.
        corpusCluster: Mapping with ``"question_vectors"`` (training
            features) and ``"clusterIds"`` (training labels). Defaults to an
            empty mapping, which raises ``KeyError``.

    Returns:
        The ``predict_proba`` output for the orphans: one row per orphan and
        one column per known cluster. This value used to be computed and
        then discarded.

    Raises:
        KeyError: If ``corpusCluster`` lacks ``"question_vectors"`` or
            ``"clusterIds"``.
    """
    # Avoid a shared mutable default argument.
    if corpusCluster is None:
        corpusCluster = {}

    # Fit the Naive Bayes model on the existing clusters.
    clf = ComplementNB()
    clf.fit(corpusCluster["question_vectors"], corpusCluster["clusterIds"])

    predictions = clf.predict_proba(
        [doc["question_vector"] for doc in orphanCorpus])
    return predictions
def ComplementNB_classification(train, test, train_labels, test_labels, res=None):
    """Train and evaluate a Complement Naive Bayes classifier.

    The report and confusion matrix are produced by
    ``utils.report_and_confmat`` (per the original note, results are saved
    in the folder "Results").

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that receives a ``"ComplementNB"`` entry with
        the model, its accuracy and its name. A fresh dict is used when
        omitted, so results no longer leak between calls through a shared
        default argument.
    :return: tuple ``(score, model)`` with the test accuracy and the fitted
        classifier
    """
    if res is None:
        res = {}

    print("Classifying with Complement Nive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
class ComplementNBImpl():
    """Thin wrapper that builds the underlying ``SKLModel`` lazily on fit."""

    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, norm=False):
        """Store the hyperparameters; no model is created until ``fit``."""
        self._hyperparams = dict(
            alpha=alpha,
            fit_prior=fit_prior,
            class_prior=class_prior,
            norm=norm,
        )

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` and fit it.

        ``y`` is forwarded only when it is not ``None``. Returns ``self``.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        return self._sklearn_model.predict_proba(X)
def main():
    """Compare three Naive Bayes variants on the wine dataset.

    The data is split 80/20 with a fixed seed; each model is trained on the
    training part and its test accuracy is printed rounded to 4 decimals.
    The iris or breast cancer datasets can be used instead.
    """
    features, targets = datasets.load_wine(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.2,
                                                        random_state=2405)

    # Multinomial Naive Bayes
    multinomial = MultinomialNB()
    multinomial.fit(x_train, y_train)
    mnb_accuracy = multinomial.score(x_test, y_test)
    print(f"MultinomialNB accuracy is {round(mnb_accuracy, 4)}")

    # Gaussian Naive Bayes
    gaussian = GaussianNB()
    gaussian.fit(x_train, y_train)
    gnb_accuracy = gaussian.score(x_test, y_test)
    print(f"GaussianNB accuracy is {round(gnb_accuracy, 4)}")

    # Complement Naive Bayes
    complement = ComplementNB()
    complement.fit(x_train, y_train)
    cnb_accuracy = complement.score(x_test, y_test)
    print(f"ComplementNB accuracy is {round(cnb_accuracy, 4)}")
class bayes(object):
    """Naive Bayes wrapper that trains the selected variant on construction.

    ``algorithm`` selects the estimator: ``'GNB'`` (Gaussian), ``'MNB'``
    (Multinomial), ``'BNB'`` (Bernoulli); any other value falls back to
    ComplementNB.
    """

    def __init__(self, data, target, algorithm="GNB"):
        """Store the inputs, build the estimator and fit it immediately."""
        self.algorithm = algorithm
        self.data = data
        self.target = target

        known_variants = (
            ('GNB', GaussianNB),
            ('MNB', MultinomialNB),
            ('BNB', BernoulliNB),
        )
        for code, estimator_cls in known_variants:
            if algorithm == code:
                self.model = estimator_cls()
                break
        else:
            # Unknown codes silently select Complement Naive Bayes.
            self.model = ComplementNB()

        self.model.fit(data, target)

    def save_model(self, path):
        """Serialise the current model to ``path`` with joblib."""
        _joblib.dump(self.model, path)

    def load_model(self, path):
        """Replace the current model with the one stored at ``path``.

        NOTE(review): joblib loading unpickles arbitrary objects; only load
        files from trusted sources.
        """
        self.model = _joblib.load(path)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)
def pickling():
    '''
    Creates and pickles both the vectorizer and model for use in prediction.

    The TF-IDF vectorizer is fitted on the descriptions of the top 15
    varieties and written to ``pickles/text_vec.pkl``; a ComplementNB model
    trained on the resulting matrix is written to ``pickles/model.pkl``.
    Both files are opened with context managers so the handles are closed
    (and the data flushed) even if pickling fails.

    Parameters
    ----------
    None

    Returns
    ----------
    None
    '''
    wrangler = Data_Handler('data/cleaned_data.csv')
    stops = wrangler.stop_words
    df = wrangler.get_top_num(15)
    y = df['variety']

    vecto = TfidfVectorizer(stop_words=stops)
    X = vecto.fit_transform(df['description'])
    with open('pickles/text_vec.pkl', 'wb') as vectorizer_file:
        pickle.dump(vecto, vectorizer_file)

    model = ComplementNB()
    model.fit(X, y)
    with open('pickles/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
def main(args):
    """Train/validate a ComplementNB model, or predict with a saved one.

    Data is unpickled from ``<root>/data/<inFile>`` and must contain a
    ``'data'`` entry (plus ``'target'`` when training). Results are written
    to ``<root>/data/<outFileName>.csv``.

    With ``args.train`` set, a single shuffled split holds out
    ``args.ratio`` of the samples for validation; accuracy and a preview of
    the predictions are printed and the fitted model is dumped under
    ``<root>/model``. Otherwise a saved model is loaded and its predictions
    for ``data['data']`` are written to the CSV.

    Args:
        args: Namespace providing ``model_name``, ``root``, ``inFile``,
            ``outFileName``, ``train`` and, depending on the branch,
            ``ratio``, ``alpha``, ``fit_prior``, ``norm``, ``model_path``
            and ``model``.
    """
    model_name = args.model_name
    model_dir = os.path.join(args.root, "model")  # get model dir
    data_dir = os.path.join(args.root, "data")  # get data dir
    data_path = os.path.join(data_dir, args.inFile)
    print('load data from' + data_path)
    # Close the file deterministically instead of leaking the handle.
    # NOTE(review): pickle executes arbitrary code on load; only use
    # trusted input files.
    with open(data_path, 'rb') as data_file:
        data = pickle.load(data_file)
    out_path = os.path.join(data_dir, args.outFileName + '.csv')
    assert 'data' in data

    if args.train:
        ratio = args.ratio
        clf = ComplementNB(alpha=args.alpha,
                           fit_prior=args.fit_prior,
                           norm=args.norm)
        assert 'target' in data
        features = data['data']
        labels = data['target']

        # One random train/validation split.
        rs = ShuffleSplit(n_splits=1, test_size=ratio)
        train_index, val_index = next(rs.split(features, labels))
        x_train = features[train_index]
        x_test = features[val_index]
        y_train = labels[train_index]
        y_test = labels[val_index]

        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # The accuracy
        print('Accuracy: \n', accuracy_score(y_test, y_pred))
        df = pd.DataFrame({
            'pred': y_pred,
            'target': y_test,
        })
        print(f'validation results save to:{args.outFileName}.csv')
        df.to_csv(out_path)
        print("Some results of validation:")
        print(df.head())

        # NOTE(review): `model` is not defined in this function; unless a
        # module-level `model` exists this line raises NameError -- confirm.
        model_path = os.path.join(model_dir, f'{model_name}_{model}.model')
        dump(clf, model_path)
    else:
        # TODO: How to Save the prediction?
        # NOTE(review): `model_path` is computed but the model is loaded
        # from `args.model`; this looks like it should be
        # `load(model_path)` -- confirm which argument is authoritative.
        model_path = os.path.join(model_dir, args.model_path)
        clf = load(args.model)
        x = data['data']
        pred = clf.predict(x)
        df = pd.DataFrame({
            'pred': pred,
        })
        df.to_csv(out_path)
class NaiveBayes():
    """ComplementNB authorship classifier over TF-IDF sentence features.

    Construction loads the train/val/test/spurious TSV files for the given
    ``division``, fits the TF-IDF vectorizer and label encoder on the
    training split, transforms every split and trains the classifier.
    """

    def __init__(self, division="sents", ngram=1):
        def read_tsv(path, columns):
            # All data files are header-less, tab-separated tables.
            return pd.read_csv(path, sep='\t', names=columns)

        labelled = ['sentence', 'author', 'work']
        self.df_train = read_tsv(f"../data/{division}_train.csv", labelled)
        self.df_val = read_tsv(f"../data/{division}_val.csv", labelled)
        self.df_test = read_tsv(f"../data/{division}_test.csv", labelled)
        self.df_spurious = read_tsv(f"../data/{division}_spurious.csv",
                                    ['sentence', 'work'])

        # Work 36 is split off as the "epistles" set; the remaining rows
        # stay in the spurious set.
        is_epistle = self.df_spurious['work'] == 36
        self.df_epistles = self.df_spurious[is_epistle]
        self.df_spurious = self.df_spurious[self.df_spurious['work'] != 36]

        stop_words = [strip_accents(word) for word in STOPS_LIST]
        self.tfidf = TfidfVectorizer(lowercase=False,
                                     stop_words=stop_words,
                                     ngram_range=(1, ngram))
        self.tfidf_train = self.tfidf.fit_transform(self.df_train['sentence'])
        self.tfidf_val = self.tfidf.transform(self.df_val['sentence'])
        self.tfidf_test = self.tfidf.transform(self.df_test['sentence'])
        self.tfidf_spurious = self.tfidf.transform(
            self.df_spurious['sentence'])
        self.tfidf_epistles = self.tfidf.transform(
            self.df_epistles['sentence'])

        self.label = LabelEncoder()
        self.author_train = self.label.fit_transform(self.df_train['author'])
        self.author_val = self.label.transform(self.df_val['author'])
        self.author_test = self.label.transform(self.df_test['author'])

        self.nb = ComplementNB()
        self.nb.fit(self.tfidf_train, self.author_train)

    def eval(self):
        """Print classification reports for the train and validation splits."""
        train_pred = self.nb.predict(self.tfidf_train)
        val_pred = self.nb.predict(self.tfidf_val)
        # NOTE(review): test predictions are computed but never reported.
        test_pred = self.nb.predict(self.tfidf_test)

        print(classification_report(self.author_train, train_pred))
        print(classification_report(self.author_val, val_pred))

    def predict(self):
        """Print how often the epistles and spurious sets are labelled "Plato".

        For the epistles the predicted author of every sentence is printed
        as well.
        """
        epistles_authors = self.label.inverse_transform(
            self.nb.predict(self.tfidf_epistles))
        print((epistles_authors == "Plato").mean())
        print(epistles_authors)

        spurious_authors = self.label.inverse_transform(
            self.nb.predict(self.tfidf_spurious))
        print((spurious_authors == "Plato").mean())
def run_compnb(x_train, x_test, y_train, y_test, x):
    """Train Complement Naive Bayes and record its results.

    The fitted model, the test data, the predictions and ``x`` are handed
    to ``get_model_results``; whatever it returns is stored in the
    module-level ``model_dict`` under ``'compnb'``.

    Returns:
        The predictions for ``x_test``.
    """
    logger.info("Running ComplementNB")

    classifier = ComplementNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    results = get_model_results(classifier, x_test, y_test, predictions, x)
    model_dict['compnb'] = results
    return predictions
class DocClfTfidfCNB():
    """Document classifier: TF-IDF features fed to Complement Naive Bayes.

    Vectorizer and classifier settings (``maxdf``, ``mindf``,
    ``maxFeatures``, ``ngramrange``, ``alphasmooth``, ``minLength``) are
    read from module-level configuration.
    """

    def __init__(self, maxStringLength=MAXSTRINGLENGH,
                 firstStringLength=FIRSTSTRINGLENGTH):
        """Store the truncation lengths and build the description message."""
        self.maxStringLength = maxStringLength
        self.firstStringLength = firstStringLength
        self.message = "Complement Naive Bayes using TF-IDF with " + "%5d" % maxFeatures + " features " + \
            " ngram-range " + "%2d" % ngramrange[0] + " to " + "%2d" % ngramrange[1] + \
            " maxString Length " + "%6d" % self.maxStringLength

    def preprocess(self, x):
        """Truncate every document.

        Returns:
            tuple: ``(xprocessed, xbegin)`` -- each document cut to
            ``maxStringLength`` and to ``firstStringLength`` characters.
        """
        xprocessed = [item[0:self.maxStringLength] for item in x]
        xbegin = [item[0:self.firstStringLength] for item in x]
        return xprocessed, xbegin

    def fit(self, x, y):
        """Fit the vectorizer and classifier; return training-set predictions."""
        # Build the TF-IDF vocabulary from the truncated documents.
        xprocessed, _ = self.preprocess(x)
        self.vectorizer = TfidfVectorizer(max_df=maxdf,
                                          min_df=mindf,
                                          max_features=maxFeatures,
                                          ngram_range=ngramrange)
        xv = self.vectorizer.fit_transform(xprocessed)
        self.nbclf = ComplementNB(alpha=alphasmooth)
        self.nbclf.fit(xv, y)
        ytrain = self.nbclf.predict(xv)
        return ytrain

    def predict(self, x):
        """Predict labels for a group of documents.

        Returns ``["No input"]`` when the first document is shorter than
        ``minLength``. Exceptions from the vectorizer or classifier
        propagate unchanged; the former ``try/except: raise`` was a no-op.
        """
        if len(x[0]) < minLength:
            return ["No input"]
        xprocessed, _ = self.preprocess(x)
        xv = self.vectorizer.transform(xprocessed)
        return self.nbclf.predict(xv)

    def confidence(self, ytest, ytestpred):
        """Compute per-label confidence and return the confusion matrix.

        The confidence of a label is its diagonal count divided by
        (0.1 + column sum) of the confusion matrix.

        NOTE: the resulting dict is stored in ``self.confidence``, which
        replaces this method on the instance; ``getConfidence`` relies on
        that, so the method can be called only once per instance.
        """
        conf_mat = confusion_matrix(ytest, ytestpred)
        # The matrix is indexed by the sorted union of true and predicted
        # labels; using only the true labels misaligned the scores whenever
        # a prediction contained a label absent from ytest.
        labels = sorted(set(ytest) | set(ytestpred))
        self.confidence = dict(
            zip(labels,
                conf_mat.diagonal() / (.1 + conf_mat.sum(axis=0))))
        return conf_mat

    def getConfidence(self, x, y):
        """Return the stored confidence for label ``y``, or -1.0 if unknown.

        ``x`` is unused. -1.0 is returned when ``y`` has no score or when
        ``confidence`` has not been computed yet.
        """
        try:
            return self.confidence[y]
        except (KeyError, TypeError):
            # KeyError: unseen label. TypeError: scores not computed yet
            # (attribute is still the bound method) or unhashable label.
            return -1.0
def train_complement_naivebayes(params,
                                x_train,
                                y_train,
                                n_folds,
                                random_state,
                                stratified=True,
                                shuffle=True):
    """Cross-validate a ComplementNB model and print per-fold metrics.

    A single model instance is re-fitted on every fold, so the returned
    model is the one trained on the last fold's training part.

    Args:
        params: Keyword arguments for ``ComplementNB``.
        x_train: Features; indexed with ``.iloc``.
        y_train: Labels; indexed with ``.iloc``.
        n_folds: Number of folds.
        random_state: Seed forwarded to the splitter.
        stratified: Use ``StratifiedKFold`` when truthy, else ``KFold``.
        shuffle: Forwarded to the splitter.

    Returns:
        The ``ComplementNB`` model after the final fold.
    """
    # Model and hyperparameter selection
    splitter_cls = StratifiedKFold if stratified else KFold
    splitter = splitter_cls(n_splits=n_folds,
                            random_state=random_state,
                            shuffle=shuffle)
    cnb_model = ComplementNB(**params)

    # Model training: each fold splits the data into train and validation.
    for fold_number, (fit_idx, val_idx) in enumerate(
            splitter.split(x_train, y_train), start=1):
        print('\n Fold %d' % fold_number)

        x_fit, x_val = x_train.iloc[fit_idx], x_train.iloc[val_idx]
        y_fit, y_val = y_train.iloc[fit_idx], y_train.iloc[val_idx]

        cnb_model.fit(x_fit, y_fit)

        # Predictions for both parts of the fold.
        fit_pred = cnb_model.predict(x_fit)
        val_pred = cnb_model.predict(x_val)

        # Training evaluation.
        train_pc = accuracy_score(y_fit, fit_pred)
        train_pp = precision_score(y_fit, fit_pred)
        train_re = recall_score(y_fit, fit_pred)
        print('\n train-Accuracy: %.6f' % train_pc)
        print(' train-Precision: %.6f' % train_pp)
        print(' train-Recall: %.6f' % train_re)

        # Validation evaluation.
        eval_pc = accuracy_score(y_val, val_pred)
        eval_pp = precision_score(y_val, val_pred)
        eval_re = recall_score(y_val, val_pred)
        print('\n eval-Accuracy: %.6f' % eval_pc)
        print(' eval-Precision: %.6f' % eval_pp)
        print(' eval-Recall: %.6f' % eval_re)

    # Return the model for evaluation and prediction.
    return cnb_model
class Recommender(object):
    '''
    Houses the text vectorizer and the stacked Naive Bayes / Random Forest
    classifiers that form the heart of this wine recommender.
    '''

    def __init__(self):
        self.nb = ComplementNB()
        self.rf = RandomForestClassifier()
        self.vecto = TfidfVectorizer()

    def _fit(self, data):
        '''
        Trains the vectorizer and both classifiers.

        The Naive Bayes class probabilities for the training descriptions
        become the input features of the random forest.

        Parameters
        ----------
        data - The filepath to the data being fit.

        Returns
        ----------
        None
        '''
        frame = Data_Handler(data).get_top_num(15)
        descriptions = frame['description']
        varieties = frame['variety']

        tfidf = self.vecto.fit_transform(descriptions)
        self.nb.fit(tfidf, varieties)

        nb_probabilities = self.nb.predict_proba(tfidf)
        self.rf.fit(nb_probabilities, varieties)

    def predict(self, text):
        '''
        Runs a single input of tasting notes through the vectorizer and the
        stacked classifiers and returns the top five predicted varieties.

        Parameters
        ----------
        text - str - The input tasting notes.

        Returns
        ----------
        top_five - array - The five varieties with the highest predicted
                           probability, most likely first.
        '''
        tfidf = self.vecto.transform([text])
        nb_probabilities = self.nb.predict_proba(tfidf)
        rf_probabilities = self.rf.predict_proba(nb_probabilities)[0]

        # argsort is ascending, so reverse it and keep the first five.
        ranked = np.argsort(rf_probabilities)[::-1]
        return self.rf.classes_[ranked[:5]]
def initFitNaiveBayes(xtrain, ytrain):
    """Create a ComplementNB with explicit settings and fit it.

    Prints a completion message and returns the fitted classifier.
    """
    settings = {
        "alpha": 1.0,
        "class_prior": None,
        "fit_prior": True,
        "norm": False,
    }
    classifier = ComplementNB(**settings)
    classifier.fit(xtrain, ytrain)
    print("Naive Bayes Training: Done")
    return classifier
def CNB(train_x, train_y, test_x, test_y):
    """Train ComplementNB and print its test predictions and accuracy.

    The predictions are reshaped to a 10 x 12 grid for display, so
    ``test_x`` must contain exactly 120 samples.
    """
    model = ComplementNB()
    model.fit(train_x, train_y)

    prediction_grid = model.predict(test_x).reshape(10, 12)
    accuracy_percent = model.score(test_x, test_y) * 100

    print('ComplementNB의 테스트 세트 예측 :\n{}'.format(prediction_grid))
    print('ComplementNB의 테스트 세트 정확도 : {0:0.2f}%'.format(
        accuracy_percent))
    print('------------------------------------------------------')
def complement_bayes(train_data, test_data):
    """Train ComplementNB on the ``'state'`` column and evaluate it.

    Features are the columns selected positionally by the module-level
    ``FEATURES_INDICES``; the fitted model, test data and predictions are
    passed to ``evaluate``.
    """
    def split(frame):
        # Target column first, then the positional feature selection.
        target = frame['state']
        return frame.iloc[:, FEATURES_INDICES], target

    train_X, train_y = split(train_data)
    test_X, test_y = split(test_data)

    model = ComplementNB()
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    evaluate(model, test_X, test_y, predictions)
def CNB_train(features, labels, ds):
    """
    Train a Complement Naive Bayes classifier (alpha=10.0) and save it as a
    pickle file named ``ds<ds>CNB_Classifier.pkl`` via ``save_classifier``.

    :param features: List of features from training set
    :param labels: List of labels from training set
    :param ds: Number of the dataset; may be an int or a string (string
        concatenation previously raised TypeError for ints)
    """
    CNB_Classifier = ComplementNB(alpha=10.0)
    CNB_Classifier.fit(features, labels)
    save_classifier(CNB_Classifier, f"ds{ds}CNB_Classifier.pkl")
def complementNB(tr_vec, tr_ans, val_vec, val_ans, te_vec):
    """Train ComplementNB, print its validation score and label the test set.

    Returns:
        Integer array that is 1 where the predicted value exceeds 0.35 and
        0 otherwise.
    """
    from sklearn.naive_bayes import ComplementNB

    model = ComplementNB()
    model.fit(tr_vec, tr_ans)
    print(model.score(val_vec, val_ans))

    print('make predictions ...')
    # NOTE(review): the threshold is applied to predicted class labels, not
    # to probabilities (a predict_proba call was commented out here) --
    # confirm this is intended.
    predicted = model.predict(te_vec)
    above_threshold = predicted > 0.35
    return above_threshold.astype(int)
def train_naive_bayes(list_of_vector_label_pairs: list,
                      binary_classification=False,
                      alpha=1.0,
                      norm=False):
    """Builds and trains a Naive Bayes classifier on a list of
    (vector, label) tuples.

    Args:
        list_of_vector_label_pairs: Training data as (vector, label) pairs.
        binary_classification: Forwarded to ``split_data_pairs``.
        alpha: Smoothing parameter passed to ``ComplementNB``.
        norm: Whether ``ComplementNB`` applies its second weight
            normalisation.

    Returns:
        The fitted ``ComplementNB`` classifier.

    Note:
        ``alpha`` and ``norm`` were previously ignored in favour of
        hard-coded ``alpha=1.0, norm=True``. They are now honoured, so
        callers that relied on the old normalised weights must pass
        ``norm=True`` explicitly.
    """
    # Repackage the data for training the classifier.
    list_of_vector_tuples, list_of_labels = split_data_pairs(
        list_of_vector_label_pairs, binary_classification)

    # Make and fit the classifier.
    classifier = ComplementNB(alpha=alpha, norm=norm)
    print("Please wait, training the Naive Bayes classifier now. . .")
    classifier.fit(list_of_vector_tuples, list_of_labels)
    print("Training complete.")
    return classifier
def get_accuracy_of_selection(X, y):
    """Estimate ComplementNB accuracy for a feature selection.

    Runs a 25-fold stratified, shuffled (unseeded) cross-validation and
    collects the accuracy of every fold.

    Args:
        X: Feature array indexable by integer index arrays.
        y: Binary label array (values convertible to 0 or 1).

    Returns:
        tuple: ``(mean, sd)`` of the per-fold accuracies.
    """
    kf = StratifiedKFold(n_splits=25, shuffle=True, random_state=None)

    accuracies = []
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Class frequencies of this fold's training labels (binary only).
        class_count = [0, 0]
        for label in y_train:
            class_count[int(label)] += 1
        total = class_count[0] + class_count[1]
        prior_probability = [class_count[0] / total, class_count[1] / total]

        # NOTE(review): scikit-learn documents ComplementNB's class_prior
        # as "not used"; the prior is still passed for parity with the
        # earlier behaviour.
        cnb = ComplementNB(class_prior=prior_probability)
        cnb.fit(X_train, y_train)
        y_pred = cnb.predict(X_test)

        # Fraction of correctly classified test samples in this fold.
        true_count = (y_test == y_pred).sum()
        accuracies.append(true_count / y_test.size)

    mean = np.mean(accuracies)
    sd = np.std(accuracies)
    return mean, sd
class NBClassifier(super.abstract_classifier):
    """Classifier backed by a Complement Naive Bayes model.

    The training data is stored at construction; ``train`` must be called
    before ``classify``.
    """

    def __init__(self, train_features, train_labels):
        self.nb_Member = ComplementNB()
        self.train_features = train_features
        self.train_labels = train_labels

    def train(self):
        """Fit the model; afterwards it is ready to classify."""
        features, labels = self.train_features, self.train_labels
        self.nb_Member.fit(features, labels)

    def classify(self, newVector):
        """Return the model's prediction for ``newVector``."""
        prediction = self.nb_Member.predict(newVector)
        return prediction
def naive_bayes(x, y):
    """Fit ComplementNB and MultinomialNB on ``(x, y)`` and predict ``x``.

    Both models are evaluated on the data they were trained on.

    Returns:
        tuple: ``(complement_predictions, multinomial_predictions)``.
    """
    complement, multinomial = ComplementNB(), MultinomialNB()

    # Train both models on the full dataset.
    for estimator in (complement, multinomial):
        estimator.fit(x, y)

    # Predict the same samples again.
    return complement.predict(x), multinomial.predict(x)
def naive_bayes(self, name="Train_Test"):
    """Train ComplementNB on a 60/40 split of ``self.X``/``self.Y`` and log scores.

    The split is seeded (``random_state=0``). The three values returned by
    ``self.nbeval`` are logged as F, P and R scores, prefixed with ``name``.
    """
    split = train_test_split(self.X, self.Y, test_size=0.4, random_state=0)
    X_train, X_test, y_train, y_test = split

    model = ComplementNB()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    f, p, r = self.nbeval(y_test, predicted)
    self.logger.info(
        "{}: F score:{:.3f}\tP score:{:.3f}\tR score:{:.3f}.".format(
            name, f, p, r))
def NB_accuracy_complement(X_train, X_test, y_train, y_test, fold):
    """Train ComplementNB and print hold-out and cross-validation metrics.

    Prints, in order: the test accuracy, the classification report, the
    confusion matrix, the mean squared error of the predicted labels, and
    the mean cross-validated score on the training data in percent.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
        fold: Passed to ``cross_val_score`` as ``cv`` (a fold count or a
            splitter object).
    """
    cnb = ComplementNB()
    cnb.fit(X_train, y_train)
    y_pred = cnb.predict(X_test)

    # The accuracy used to be computed and then discarded; report it.
    print("accuracy_score: ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("mean_squared_error: ", mean_squared_error(y_test, y_pred))

    results = cross_val_score(cnb, X_train, y_train, cv=fold)
    # Label the line with the real fold count instead of a hard-coded 5;
    # splitter objects expose it as n_splits.
    n_folds = getattr(fold, "n_splits", fold)
    print(f"After {n_folds}-fold: ", results.mean() * 100)