def _complementnb(*, train, test, x_predict=None, metrics, alpha=1.0, fit_prior=True, class_prior=None, norm=False): """For for info visit : https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB """ model = ComplementNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior, norm=norm) model.fit(train[0], train[1]) model_name = 'ComplementNB' y_hat = model.predict(test[0]) if metrics == 'f1_score': accuracy = f1_score(test[1], y_hat) if metrics == 'jaccard_score': accuracy = jaccard_score(test[1], y_hat) if metrics == 'accuracy_score': accuracy = accuracy_score(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
class CNBTwoStepClassifier(ImbalancedTrainerInterface):
    """Two-stage Complement Naive Bayes classifier.

    ``clf_yn`` is trained on all samples, with one partition keeping its own
    labels and the other relabelled ``'not'``; ``clf_n`` is trained on that
    second partition only and is consulted whenever ``clf_yn`` predicts
    ``'not'``.  The partition comes from the inherited ``_partition`` helper
    (presumably dominant class vs. the rest -- confirm against the base
    class).
    """

    def __init__(self, alpha=1):
        """Create both stage classifiers with the same smoothing ``alpha``."""
        self.alpha = alpha
        self.clf_yn = ComplementNB(alpha=alpha)
        self.clf_n = ComplementNB(alpha=alpha)

    def fit(self, X_train, y_train):
        """Fit both stages; ``X_train`` must provide ``toarray()``."""
        dense_rows = X_train.toarray().tolist()
        labels = pd.Series(y_train)
        dominant = self._find_dominant_class(dense_rows, labels)
        x_w, x_o, y_w, y_o = self._partition(dense_rows, labels, dominant[0])

        # Stage one sees every sample; the second partition collapses to 'not'.
        stage_one_x = x_w + x_o
        stage_one_y = y_w + ['not'] * len(y_o)
        self.clf_yn.fit(stage_one_x, stage_one_y)

        # Stage two only learns to tell the second partition's labels apart.
        self.clf_n.fit(x_o, y_o)

    def predict(self, X_test):
        """Return a list of labels, deferring to stage two on ``'not'``."""
        first_pass = self.clf_yn.predict(X_test)
        return [
            self.clf_n.predict(row)[0] if guess == 'not' else guess
            for guess, row in zip(first_pass, X_test)
        ]

    def score(self, X_test, y_test):
        """Return the fraction of predictions equal to ``y_test``."""
        expected = pd.Series(y_test)
        predicted = self.predict(X_test)
        return np.mean(predicted == expected)
def main(args):
    """Train a ComplementNB model or run predictions with a saved one.

    Reads ``args.root``, ``args.inFile``, ``args.outFileName``,
    ``args.model_name`` and ``args.train``.  In training mode it also reads
    ``args.ratio``, ``args.alpha``, ``args.fit_prior`` and ``args.norm``; in
    prediction mode it reads ``args.model_path`` (relative to
    ``<root>/model``).

    The input pickle must be a mapping with a ``'data'`` entry (and a
    ``'target'`` entry when training).  Results are written as CSV to
    ``<root>/data/<outFileName>.csv``.
    """
    model_name = args.model_name
    model_dir = os.path.join(args.root, "model")  # get model dir
    data_dir = os.path.join(args.root, "data")  # get data dir
    data_path = os.path.join(data_dir, args.inFile)
    print('load data from' + data_path)
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(data_path, 'rb') as data_file:
        data = pickle.load(data_file)
    out_path = os.path.join(data_dir, args.outFileName + '.csv')
    assert 'data' in data

    if args.train:
        ratio = args.ratio
        clf = ComplementNB(alpha=args.alpha, fit_prior=args.fit_prior,
                           norm=args.norm)
        assert 'target' in data
        features = data['data']
        labels = data['target']

        # Single random train/validation split.
        rs = ShuffleSplit(n_splits=1, test_size=ratio)
        train_index, val_index = next(rs.split(features, labels))
        x_train = features[train_index]
        x_test = features[val_index]
        y_train = labels[train_index]
        y_test = labels[val_index]

        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        # The accuracy
        print('Accuracy: \n', accuracy_score(y_test, y_pred))
        df = pd.DataFrame({
            'pred': y_pred,
            'target': y_test,
        })
        print(f'validation results save to:{args.outFileName}.csv')
        df.to_csv(out_path)
        print("Some results of validation:")
        print(df.head())

        # The original f-string referenced an undefined name `model`, so the
        # run crashed after training; the estimator class name is used as the
        # suffix instead.
        model_path = os.path.join(
            model_dir, f'{model_name}_{type(clf).__name__}.model')
        dump(clf, model_path)
    else:
        # TODO: How to Save the prediction?
        model_path = os.path.join(model_dir, args.model_path)
        # Load from the resolved path (it was computed but ignored before).
        clf = load(model_path)
        x = data['data']
        pred = clf.predict(x)
        df = pd.DataFrame({
            'pred': pred,
        })
        df.to_csv(out_path)
class NaiveBayes():
    """TF-IDF + ComplementNB author classifier over tab-separated corpora.

    The constructor loads the train/val/test/spurious CSV files for the given
    ``division``, vectorises the sentences, encodes author labels and fits
    the model.
    """

    def __init__(self, division="sents", ngram=1):
        def load_split(split, columns):
            # All corpora live next to each other and share one layout.
            return pd.read_csv(f"../data/{division}_{split}.csv", sep='\t',
                               names=columns)

        labelled_columns = ['sentence', 'author', 'work']
        self.df_train = load_split("train", labelled_columns)
        self.df_val = load_split("val", labelled_columns)
        self.df_test = load_split("test", labelled_columns)

        # Work number 36 is split out of the spurious set into df_epistles.
        spurious = load_split("spurious", ['sentence', 'work'])
        self.df_spurious = spurious
        self.df_epistles = spurious[spurious['work'] == 36]
        self.df_spurious = spurious[spurious['work'] != 36]

        stop_words = [strip_accents(word) for word in STOPS_LIST]
        self.tfidf = TfidfVectorizer(lowercase=False, stop_words=stop_words,
                                     ngram_range=(1, ngram))
        # The vocabulary is learned from the training sentences only.
        self.tfidf_train = self.tfidf.fit_transform(self.df_train['sentence'])
        self.tfidf_val = self.tfidf.transform(self.df_val['sentence'])
        self.tfidf_test = self.tfidf.transform(self.df_test['sentence'])
        self.tfidf_spurious = self.tfidf.transform(
            self.df_spurious['sentence'])
        self.tfidf_epistles = self.tfidf.transform(
            self.df_epistles['sentence'])

        self.label = LabelEncoder()
        self.author_train = self.label.fit_transform(self.df_train['author'])
        self.author_val = self.label.transform(self.df_val['author'])
        self.author_test = self.label.transform(self.df_test['author'])

        self.nb = ComplementNB()
        self.nb.fit(self.tfidf_train, self.author_train)

    def eval(self):
        """Print classification reports for the train and validation sets."""
        train_pred = self.nb.predict(self.tfidf_train)
        val_pred = self.nb.predict(self.tfidf_val)
        # Test predictions are computed but not reported.
        _test_pred = self.nb.predict(self.tfidf_test)
        print(classification_report(self.author_train, train_pred))
        print(classification_report(self.author_val, val_pred))

    def predict(self):
        """Print the share of "Plato" predictions for epistles and spurious."""
        def predicted_authors(matrix):
            return self.label.inverse_transform(self.nb.predict(matrix))

        epistles_labels = predicted_authors(self.tfidf_epistles)
        print((epistles_labels == "Plato").mean())
        print(epistles_labels)

        spurious_labels = predicted_authors(self.tfidf_spurious)
        print((spurious_labels == "Plato").mean())
def train_complement_naivebayes(params, x_train, y_train, n_folds,
                                random_state, stratified=True, shuffle=True):
    """Cross-validate a ComplementNB model and print per-fold metrics.

    Args:
        params: Keyword arguments forwarded to ``ComplementNB``.
        x_train: Features; indexed positionally with ``.iloc``.
        y_train: Labels; indexed positionally with ``.iloc``.
        n_folds: Number of cross-validation folds.
        random_state: Seed for the splitter; only applied when ``shuffle``
            is true.
        stratified: Use ``StratifiedKFold`` when true, ``KFold`` otherwise.
        shuffle: Whether the splitter shuffles before splitting.

    Returns:
        The ``ComplementNB`` instance.  The same object is refitted on every
        fold, so it ends up fitted on the last fold's training split.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    # Recent scikit-learn raises if random_state is set while shuffle=False,
    # so the seed is only forwarded when it has an effect.
    kf = splitter_cls(n_splits=n_folds,
                      random_state=random_state if shuffle else None,
                      shuffle=shuffle)

    cnb_model = ComplementNB(**params)

    for fold, (train_index, test_index) in enumerate(
            kf.split(x_train, y_train), start=1):
        print('\n Fold %d' % fold)

        x_train_cv, x_val_cv = x_train.iloc[train_index], x_train.iloc[
            test_index]
        y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[
            test_index]

        cnb_model.fit(x_train_cv, y_train_cv)

        # Predict on both splits so train and validation metrics can be
        # compared fold by fold.
        scores_cv = cnb_model.predict(x_train_cv)
        scores_val = cnb_model.predict(x_val_cv)

        train_pc = accuracy_score(y_train_cv, scores_cv)
        train_pp = precision_score(y_train_cv, scores_cv)
        train_re = recall_score(y_train_cv, scores_cv)
        print('\n train-Accuracy: %.6f' % train_pc)
        print(' train-Precision: %.6f' % train_pp)
        print(' train-Recall: %.6f' % train_re)

        eval_pc = accuracy_score(y_val_cv, scores_val)
        eval_pp = precision_score(y_val_cv, scores_val)
        eval_re = recall_score(y_val_cv, scores_val)
        print('\n eval-Accuracy: %.6f' % eval_pc)
        print(' eval-Precision: %.6f' % eval_pp)
        print(' eval-Recall: %.6f' % eval_re)

    # return model for evaluation and prediction
    return cnb_model
class DocClfTfidfCNB():
    """Document classifier: TF-IDF features fed to Complement Naive Bayes.

    Vectoriser and smoothing settings come from module-level names
    (``maxdf``, ``mindf``, ``maxFeatures``, ``ngramrange``, ``alphasmooth``,
    ``minLength``).
    """

    def __init__(self, maxStringLength=MAXSTRINGLENGH,
                 firstStringLength=FIRSTSTRINGLENGTH):
        self.maxStringLength = maxStringLength
        self.firstStringLength = firstStringLength
        # Per-label confidence table, filled by confidence().  It is kept in
        # a private attribute: storing it as ``self.confidence`` replaced the
        # method of the same name after the first call.
        self._confidence = {}
        self.message = (
            "Complement Naive Bayes using TF-IDF with " + "%5d" % maxFeatures
            + " features "
            + " ngram-range " + "%2d" % ngramrange[0]
            + " to " + "%2d" % ngramrange[1]
            + " maxString Length " + "%6d" % self.maxStringLength)

    def preprocess(self, x):
        """Truncate every item of ``x``.

        Returns:
            ``(xprocessed, xbegin)``: items cut to ``maxStringLength`` and to
            ``firstStringLength`` respectively.
        """
        xprocessed = [item[0:self.maxStringLength] for item in x]
        xbegin = [item[0:self.firstStringLength] for item in x]
        return xprocessed, xbegin

    def fit(self, x, y):
        """Fit vectoriser and classifier; return predictions on ``x``."""
        xprocessed, _ = self.preprocess(x)
        self.vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf,
                                          max_features=maxFeatures,
                                          ngram_range=ngramrange)
        xv = self.vectorizer.fit_transform(xprocessed)
        self.nbclf = ComplementNB(alpha=alphasmooth)
        self.nbclf.fit(xv, y)
        ytrain = self.nbclf.predict(xv)
        return ytrain

    def predict(self, x):
        """Predict labels for a group of documents.

        Returns ``["No input"]`` when the first document is shorter than
        ``minLength``.
        """
        if len(x[0]) < minLength:
            return ["No input"]
        xprocessed, _ = self.preprocess(x)
        xv = self.vectorizer.transform(xprocessed)
        return self.nbclf.predict(xv)

    def confidence(self, ytest, ytestpred):
        """Compute per-label confidence and return the confusion matrix.

        Confidence for a label is its diagonal count divided by
        ``0.1 + column sum`` of the confusion matrix (the 0.1 avoids a
        division by zero for labels that were never predicted).
        """
        conf_mat = confusion_matrix(ytest, ytestpred)
        # confusion_matrix orders rows/columns by the sorted union of true
        # and predicted labels; using only the true labels could shift the
        # mapping when a label occurs in the predictions alone.
        labels = sorted(set(ytest) | set(ytestpred))
        self._confidence = dict(zip(
            labels, conf_mat.diagonal() / (.1 + conf_mat.sum(axis=0))))
        return conf_mat

    def getConfidence(self, x, y):
        """Return the stored confidence for label ``y``, or -1.0 if unknown."""
        try:
            return getattr(self, "_confidence", {})[y]
        except (KeyError, TypeError):
            # Unknown or unhashable label.
            return -1.0
def tune_cnb(params):
    """Objective for tuning ComplementNB's ``alpha``: ``1 - mean ROC AUC``.

    Reads the module-level ``args`` (``file``, ``seeds``, ``train_size``).
    For each seed the CSV rows are split at random into train/test, a model
    is fitted on the training rows, and its hard predictions on the test
    rows are scored with ``roc_auc_score`` against the ``'Security'`` column.

    Args:
        params: Sequence whose first element is the ``alpha`` to evaluate.

    Returns:
        ``1 - average AUC`` over all seeds, so a minimiser maximises AUC.
    """
    alpha_ = params[0]
    csv_path = args.file

    if args.seeds == '10':
        seeds = [(i + 1) * 100 for i in range(10)]
    elif args.seeds == '100':
        seeds = [(i + 1) * 100 for i in range(100)]
    else:
        seeds = [100]

    train_size = float(args.train_size)
    if train_size > 1 or train_size < 0:
        print('Train size invalid. Please enter a value between 0 and 1.')
        exit()

    # The file does not change between seeds, so read it once.
    df = pd.read_csv(csv_path)

    total_auc = 0.0
    for seed in seeds:
        clf = ComplementNB(alpha=alpha_)

        # Seed the split: the loop variable was never used before, so every
        # "seeded" run drew from the unseeded global generator.
        rng = np.random.RandomState(seed)
        df['is_train'] = rng.uniform(0, 1, len(df)) <= train_size
        train, test = df[df['is_train']], df[~df['is_train']]

        # Features: everything but the first column and the last two
        # (the final CSV column and the 'is_train' helper column).
        features = df.columns[1:-2]

        clf.fit(train[features], train['Security'])
        preds = clf.predict(test[features])
        total_auc += roc_auc_score(test['Security'], preds)

    # Average over the runs actually performed; int(args.seeds) was wrong
    # (or raised) whenever args.seeds was neither '10' nor '100'.
    # scipy DE minimizes functions, hence the complement.
    return 1 - total_auc / len(seeds)
def ComplementNB_classification(train, test, train_labels, test_labels,
                                res=None):
    """Fit ComplementNB, report on the test set and record the result.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that receives a ``"ComplementNB"`` entry with
        the model, its accuracy and its name; a fresh dict is used when
        omitted
    :return: tuple ``(score, model)``
    """
    # A ``{}`` default would be shared between calls; create it per call.
    if res is None:
        res = {}

    print("Classifying with Complement Nive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
class ComplementNBImpl():
    """Thin wrapper that builds ``SKLModel`` from stored hyperparameters."""

    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None,
                 norm=False):
        """Record the hyperparameters; the model is created in ``fit``."""
        self._hyperparams = dict(
            alpha=alpha,
            fit_prior=fit_prior,
            class_prior=class_prior,
            norm=norm,
        )

    def fit(self, X, y=None):
        """Create a fresh model, fit it and return ``self``."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        # Labels are only forwarded when the caller supplied them.
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        fitted = self._sklearn_model
        return fitted.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        fitted = self._sklearn_model
        return fitted.predict_proba(X)
def get_optimal_values_ComplementNB(x_train, y_train, x_val, y_val):
    """Grid-search ``alpha``, ``fit_prior`` and ``norm`` for ComplementNB.

    Every combination is fitted on the training data and scored by accuracy
    (in percent) on the validation data; the first combination reaching the
    highest score wins.

    Returns:
        ``(max_score, optimal_alpha, optimal_fit_prior, optimal_norm)``.
    """
    alphas = [x / 10 for x in range(0, 11)]
    fit_priors = [True, False]
    norms = [True, False]

    # Fallbacks reported when no combination scores above zero.
    best_score = 0
    best_alpha = 1.0
    best_fit_prior = True
    best_norm = False

    # Evaluate every combination to choose the best parameters.
    grid = ((a, p, n) for a in alphas for p in fit_priors for n in norms)
    for alpha, fit_prior, norm in grid:
        candidate = ComplementNB(alpha=alpha, fit_prior=fit_prior, norm=norm)
        candidate.fit(x_train, y_train)
        score = accuracy_score(y_val, candidate.predict(x_val)) * 100
        # Strict comparison: ties keep the earlier combination.
        if best_score < score:
            best_alpha, best_fit_prior, best_norm = alpha, fit_prior, norm
            best_score = score

    print(best_score, best_alpha, best_fit_prior, best_norm)
    return best_score, best_alpha, best_fit_prior, best_norm
class bayes(object):
    """Naive Bayes model selected by a short code and fitted on creation.

    ``'GNB'``, ``'MNB'`` and ``'BNB'`` select the Gaussian, Multinomial and
    Bernoulli variants; any other value selects ComplementNB.
    """

    def __init__(self, data, target, algorithm="GNB"):
        self.algorithm = algorithm
        self.data = data
        self.target = target
        self.model = self._build_model(algorithm)
        self.model.fit(data, target)

    @staticmethod
    def _build_model(algorithm):
        """Instantiate the estimator matching ``algorithm``."""
        if algorithm == 'GNB':
            return GaussianNB()
        if algorithm == 'MNB':
            return MultinomialNB()
        if algorithm == 'BNB':
            return BernoulliNB()
        # Everything else falls back to Complement Naive Bayes.
        return ComplementNB()

    def save_model(self, path):
        """Persist the fitted model to ``path``."""
        _joblib.dump(self.model, path)

    def load_model(self, path):
        """Replace the current model with the one stored at ``path``."""
        self.model = _joblib.load(path)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)
def run_compnb(x_train, x_test, y_train, y_test, x):
    """Fit Complement Naive Bayes, record its results and return predictions.

    The output of ``get_model_results`` is stored under ``'compnb'`` in the
    module-level ``model_dict``.
    """
    logger.info("Running ComplementNB")

    classifier = ComplementNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    results = get_model_results(classifier, x_test, y_test, predictions, x)
    model_dict['compnb'] = results
    return predictions
def run_Complement_Naive_Bayes(X_train, y_train, X_test, is_norm=False):
    """Train ComplementNB and return test predictions as a uint64 array.

    ``is_norm`` is forwarded as the estimator's ``norm`` option.
    """
    print('Training model')
    from sklearn.naive_bayes import ComplementNB

    model = ComplementNB(norm=is_norm)
    model.fit(X_train, y_train)

    print('Predicting on test data')
    raw_labels = model.predict(X_test)
    # Predicted labels are cast to unsigned 64-bit integers.
    return np.asarray(raw_labels, dtype=np.uint64)
def CNB(train_x, train_y, test_x, test_y, pred_shape=(10, 12)):
    """Fit ComplementNB and print test predictions and accuracy.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Test features.
        test_y: Test labels.
        pred_shape: Shape the predictions are reshaped to before printing.
            Defaults to ``(10, 12)``, which requires exactly 120 test
            samples; pass another shape, or ``None`` to print them as-is.
    """
    # Print the results of the ComplementNB algorithm.
    cnb = ComplementNB()
    cnb.fit(train_x, train_y)

    pre_arr = cnb.predict(test_x)
    if pred_shape is not None:
        pre_arr = pre_arr.reshape(pred_shape)

    print('ComplementNB의 테스트 세트 예측 :\n{}'.format(pre_arr))
    print('ComplementNB의 테스트 세트 정확도 : {0:0.2f}%'.format(
        cnb.score(test_x, test_y) * 100))
    print('------------------------------------------------------')
def complement_bayes(train_data, test_data):
    """Fit ComplementNB on ``train_data`` and evaluate it on ``test_data``.

    Labels come from the ``'state'`` column; features are the columns
    selected positionally by the module-level ``FEATURES_INDICES``.
    """
    train_labels, train_features = (train_data['state'],
                                    train_data.iloc[:, FEATURES_INDICES])
    test_labels, test_features = (test_data['state'],
                                  test_data.iloc[:, FEATURES_INDICES])

    model = ComplementNB()
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)

    evaluate(model, test_features, test_labels, predictions)
def complementNB(tr_vec, tr_ans, val_vec, val_ans, te_vec):
    """Fit ComplementNB, print the validation score, return thresholded output.

    Returns an integer array that is 1 where the prediction for ``te_vec``
    is greater than 0.35 and 0 elsewhere.
    """
    from sklearn.naive_bayes import ComplementNB

    model = ComplementNB()
    model.fit(tr_vec, tr_ans)
    validation_score = model.score(val_vec, val_ans)
    print(validation_score)

    print('make predictions ...')
    # NOTE(review): the threshold is applied to predict() output (labels),
    # not to predict_proba() -- confirm this is intended.
    test_predictions = model.predict(te_vec)
    return (test_predictions > 0.35).astype(int)
def naive_bayes(self, name="Train_Test"):
    """Evaluate ComplementNB on a 60/40 split of ``self.X`` / ``self.Y``.

    The three values returned by ``self.nbeval`` are logged as F, P and R
    scores, prefixed with ``name``.
    """
    split = train_test_split(self.X, self.Y, test_size=0.4, random_state=0)
    X_train, X_test, y_train, y_test = split

    model = ComplementNB()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    f, p, r = self.nbeval(y_test, predicted)
    template = "{}: F score:{:.3f}\tP score:{:.3f}\tR score:{:.3f}."
    self.logger.info(template.format(name, f, p, r))
def naive_bayes(x, y):
    """Fit ComplementNB and MultinomialNB on ``(x, y)`` and predict ``x``.

    Returns:
        ``(complement_predictions, multinomial_predictions)``, both made on
        the same data the models were trained on.
    """
    models = (ComplementNB(), MultinomialNB())

    # Train both models on the full dataset.
    for model in models:
        model.fit(x, y)

    # Predict on the training inputs themselves.
    complement_pred, multinomial_pred = (model.predict(x) for model in models)
    return complement_pred, multinomial_pred
def get_accuracy_of_selection(X, y):
    """Estimate ComplementNB accuracy with 25-fold stratified CV.

    Labels are expected to be 0/1: each fold counts both classes in the
    training split and passes their frequencies as ``class_prior``.

    Returns:
        ``(mean, sd)`` of the per-fold accuracies.
    """
    folds = StratifiedKFold(n_splits=25, shuffle=True, random_state=None)
    fold_accuracies = []

    for train_idx, test_idx in folds.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Class frequencies of this fold's training labels.
        counts = [0, 0]
        for label in y_train:
            counts[int(label)] += 1
        n_train = counts[0] + counts[1]
        priors = [counts[0] / n_train, counts[1] / n_train]

        model = ComplementNB(class_prior=priors)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Fraction of test samples predicted correctly.
        n_correct = (y_test == y_pred).sum()
        fold_accuracies.append(n_correct / y_test.size)

    return np.mean(fold_accuracies), np.std(fold_accuracies)
class NBClassifier(super.abstract_classifier):
    """Classifier backed by a ComplementNB model.

    Training data is stored at construction; ``train`` must be called before
    ``classify``.
    """

    def __init__(self, train_features, train_labels):
        self.train_features, self.train_labels = train_features, train_labels
        self.nb_Member = ComplementNB()

    def train(self):
        """Fit the model on the stored features and labels."""
        features, labels = self.train_features, self.train_labels
        self.nb_Member.fit(features, labels)

    def classify(self, newVector):
        """Return the model's prediction for ``newVector``."""
        prediction = self.nb_Member.predict(newVector)
        return prediction
def guassian_distribution_classifier(self):
    """Train ComplementNB (alpha=1.590) and write test predictions to CSV.

    Predictions are decoded with ``self.Encoder`` and paired with ids
    ``0..len(self.test_data) - 1``; the table is handed to
    ``self._csv_output_generator`` together with the module-level
    ``final_output_path``.
    """
    train_matrix, test_matrix, train_labels = self._sklearn_data_cleaning()

    model = ComplementNB(alpha=1.590)
    model.fit(train_matrix, train_labels)

    # Predict the encoded labels for the test matrix.
    encoded_predictions = model.predict(test_matrix)

    row_ids = list(range(0, len(self.test_data)))
    categories = self.Encoder.inverse_transform(encoded_predictions)
    final_dt = pd.DataFrame(list(zip(categories, row_ids)),
                            columns=['Category', 'Id'])
    self._csv_output_generator(final_dt, final_output_path)
def complement_bayes(x_train, x_test, y_train, y_test, X, fl, amostra_paci3,
                     fl_a3, nome):
    """Fit ComplementNB, score a patient sample and report AUC metrics.

    Args:
        x_train, y_train: Training features and labels.
        x_test, y_test: Validation features and labels.
        X, fl: Full feature set and labels used for the final report.
        amostra_paci3: DataFrame of sample rows to score.  It is modified in
            place: columns ``result``, ``probls`` and ``fl_severidade`` are
            added, and it is written to ``modelo_complement_bayes.csv``.
        fl_a3: Values stored in the ``fl_severidade`` column.
        nome: Passed to ``plot_roc_curve`` together with the labels/scores.
    """
    Complement = ComplementNB()
    Complement.fit(x_train, y_train)

    train_proba = Complement.predict_proba(x_train)

    # Score the sample before any helper column is added to it.
    sample_proba = Complement.predict_proba(amostra_paci3)
    sample_pred = Complement.predict(amostra_paci3)

    amostra_paci3['result'] = sample_pred
    # predict_proba returns one column per class; a single DataFrame column
    # can only hold one of them, so store the second column's probability,
    # the same column used for every AUC below.
    amostra_paci3['probls'] = sample_proba[:, 1]
    amostra_paci3['fl_severidade'] = fl_a3
    amostra_paci3.to_csv('modelo_complement_bayes.csv')

    print('Treinamento AUC-ROC:{}'.format(
        roc_auc_score(y_train, train_proba[:, 1])))
    test_proba = Complement.predict_proba(x_test)
    print('Validacao AUC-ROC:{}'.format(
        roc_auc_score(y_test, test_proba[:, 1])))

    yhat = Complement.predict_proba(X)[:, 1]
    full_pred = Complement.predict(X)
    print(pd.crosstab(fl, full_pred))
    print(classification_report(fl, full_pred))
    print('AUC: %0.2f' % roc_auc_score(fl, yhat))
    plot_roc_curve(fl, yhat, nome)
def parameter_iteration_tunning():
    """Print ComplementNB validation accuracy for a fixed list of alphas.

    Training and validation data are read through ``service().read_csv_data``
    from the ``dataset1/ds1`` CSV files.
    """
    data_service = service()
    train_features, train_labels = data_service.read_csv_data(
        'dataset1/ds1/ds1Train.csv')
    validation_features, validation_labels = data_service.read_csv_data(
        'dataset1/ds1/ds1Val.csv')

    candidate_alphas = (0.01, 0.001, 0.1, 1, 10, 100, 1000)
    for alpha in candidate_alphas:
        model = ComplementNB(alpha=alpha)
        model.fit(train_features, train_labels)
        predicted = model.predict(validation_features)
        score = accuracy_score(validation_labels, predicted)
        print('ComplementNB accuracy ' + str(alpha) + ' is', score)
def NB_accuracy_complement(X_train, X_test, y_train, y_test, fold):
    """Fit ComplementNB and print hold-out and cross-validation metrics.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
        fold: Passed as ``cv`` to ``cross_val_score``.
    """
    gnb = ComplementNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    # The accuracy was computed but its value thrown away; report it.
    print("accuracy_score: ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("mean_squared_error: ", mean_squared_error(y_test, y_pred))

    results = cross_val_score(gnb, X_train, y_train, cv=fold)
    # Label the line with the real fold count instead of a fixed "5-fold".
    if isinstance(fold, int):
        label = "After {}-fold: ".format(fold)
    else:
        label = "After cross-validation: "
    print(label, results.mean() * 100)
def trainv2(std=False, algo='mnb', n=None):
    """Train a model using Naive Bayes

    Parameters
    ----------
    std : bool
        Standardize the data.
    algo : str
        The algorithm to use. Can be either `mnb` or `cnb`; any other value
        falls back to `mnb`.
    n : int
        Select n samples from each category. (Default: All)

    Returns
    -------
    None
        The fitted model is saved with ``save_model`` and evaluation output
        is produced through the ``measure`` and ``learning_curve`` helpers.
    """
    df = create_dataframe(n=n)
    counts, df = process_dataframe(df, algo=algo)

    ### Todo: Remove
    save_obj(df, 'v2_dataframe.p')
    save_obj(counts, 'v2_counts.p')
    ###

    # messages_train, messages_test, labels_train, labels_test
    x_train, x_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.3, random_state=69)

    if std:
        x_train, x_test = standardize(x_train, x_test)

    if algo == 'cnb':
        model = ComplementNB()
    elif algo == 'mnb':
        model = MultinomialNB()
    else:
        logger.critical(
            'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
        )
        model = MultinomialNB()
        # Keep the label in sync with the model actually trained, so the
        # saved artefact and the plot title are not filed under the unknown
        # name.
        algo = 'mnb'

    model.fit(x_train, y_train)
    save_model(model, version='v2', algo=algo)

    y_pred = model.predict(x_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)

    title = f'Learning Curves ({algo.upper()})'
    learning_curve.plot(model, x_test, y_test, title=title)
def complement_nb(self, train, test):
    """Fit ComplementNB and print train/test evaluation reports.

    Args:
        train: Training data, split into features/labels by
            ``self.split_xy``.
        test: Test data, split the same way.

    Returns:
        The fitted ``ComplementNB`` model.
    """
    print('STARTING MULTINOMIAL NAIVE BAYES (COMPLEMENT)')
    trainX, trainy = self.split_xy(train)
    testX, testy = self.split_xy(test)

    model = ComplementNB()
    model.fit(trainX, trainy)
    train_pred = model.predict(trainX)
    test_pred = model.predict(testX)

    cm = confusion_matrix(trainy, train_pred)
    # Accuracy is the diagonal over the total.  The trace covers any number
    # of classes; summing cm[0][0] + cm[1][1] was only right for two.
    acc = np.trace(cm) / np.sum(cm)
    print('------train evaluation------')
    print(cm)
    print(classification_report(trainy, train_pred))
    print('TRAIN ACCURACY : {}\n'.format(np.round(acc, 4)))

    cm = confusion_matrix(testy, test_pred)
    acc = np.trace(cm) / np.sum(cm)
    print('\n------test evaluation------')
    print(cm)
    print(classification_report(testy, test_pred))
    print('TEST ACCURACY : {}\n\n'.format(np.round(acc, 4)))

    return model
def test_model(self, X_test):
    """Fit three ComplementNB models from pickled matrices and predict.

    The text, title and author matrices are loaded from ``resources/``; one
    model is fitted on each against the free name ``y_train`` (not defined
    in this function -- presumably a module-level variable).  Predictions
    are stored on ``self`` as ``author_predict``, ``title_predict`` and
    ``text_predict``; nothing is returned.
    """
    loaded = []
    for file_name in ("X_text_matrix.pkl", "X_title_matrix.pkl",
                      "X_author_matrix.pkl"):
        with open(os.path.join("resources", file_name), "rb") as handle:
            loaded.append(pickle.load(handle))
    text_matrix, title_matrix, author_matrix = loaded

    print(author_matrix[:5])
    author_clf = ComplementNB().fit(author_matrix, y_train)
    title_clf = ComplementNB().fit(title_matrix, y_train)
    text_clf = ComplementNB().fit(text_matrix, y_train)
    print(text_clf)

    # The author model predicts on the freshly vectorised test input.
    author_raw = author_clf.predict(self.vectorize(X_test))
    author_predict = np.asarray(author_raw, dtype=np.float64, order='C')

    title_raw = title_clf.predict(self.title_vector)
    title_predict = np.asarray(title_raw, dtype=np.float64, order='C')
    self.title_predict = title_predict

    # Text predictions are kept as returned (no float conversion).
    text_predict = text_clf.predict(self.text_vector)

    self.author_predict = author_predict
    self.text_predict = text_predict
    return
class GenderClassifier:
    """Predict 'M'/'F' from a name using character-affix features.

    Names are turned into prefix/suffix features, vectorised with
    ``DictVectorizer`` and classified with ``ComplementNB``.  Labels are
    stored as the strings ``'0'`` (M) and ``'1'`` (F).
    """

    def __init__(self):
        self._classifier = ComplementNB()
        self._vectorizer = DictVectorizer()

    def _getFeatures(self, name):
        """Return the prefix/suffix feature dict for a lower-cased ``name``."""
        name = name.lower()
        return {
            "firstL": name[0],
            "first2L": name[:2],
            "first3L": name[:3],
            "lastL": name[-1],
            "last2": name[-2:],
            "last3": name[-3:],
            "last4": name[-4:],
        }

    def preprocess(self, df):
        """Shuffle ``df`` and return ``(X, y)``.

        ``X`` is a Series of feature dicts built from the ``name`` column;
        ``y`` is the ``gender`` column with 'M'/'F' mapped to '0'/'1'.
        """
        # shuffle dataset
        df = df.sample(frac=1, random_state=10).reset_index(drop=True)
        # Use the returned Series: replace(..., inplace=True) on a selected
        # column is a chained assignment and does not update the frame under
        # pandas copy-on-write.
        y = df['gender'].replace(['M', 'F'], ['0', '1'])
        X = df['name'].apply(self._getFeatures)
        return X, y

    def train(self):
        """Fit the vectoriser and classifier on the stored dataset."""
        self._vectorizer.fit(self._X)
        self._classifier.fit(self._vectorizer.transform(self._X), self._y)

    def predict(self, name):
        """Return 'F' or 'M' for a single ``name``."""
        transformed = self._vectorizer.transform(self._getFeatures(name))
        predicted = self._classifier.predict(transformed)
        # predict() returns a one-element array; index it explicitly rather
        # than relying on the deprecated array-to-scalar conversion.
        if int(predicted[0]):
            return 'F'
        return 'M'

    # preprocess & train classifier
    def setup(self, df):
        """Preprocess ``df``, store it as the dataset and train."""
        self._X, self._y = self.preprocess(df)
        self.train()

    # append provided name and gender to current dataset
    def add_new(self, df):
        """Append the rows of ``df`` to the stored dataset.

        The classifier is not retrained here; call ``train`` afterwards.
        """
        # Local import: this block does not otherwise reference pandas.
        import pandas as pd

        add_x, add_y = self.preprocess(df)
        # Series.append was removed in pandas 2.0; concat is the replacement.
        self._X = pd.concat([self._X, add_x], ignore_index=True)
        self._y = pd.concat([self._y, add_y], ignore_index=True)
def train(std=False, algo='mnb'):
    """Train a model using Naive Bayes

    Parameters
    ----------
    std : bool
        Standardize the data
    algo : str
        The algorithm to use. Can be either `mnb` or `cnb`; any other value
        falls back to `mnb`.

    Returns
    -------
    None
        The fitted model is saved with ``save_model`` and evaluation output
        is produced through the ``measure`` and ``learning_curve`` helpers.
    """
    dictionary = make_dictionary()
    features, labels = make_dataset(dictionary)

    ### Todo: Remove
    save_obj(features, 'v1_features.p')
    save_obj(labels, 'v1_labels.p')
    ###

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=69)

    if std:
        x_train, x_test = standardize(x_train, x_test)

    if algo == 'cnb':
        model = ComplementNB()
    elif algo == 'mnb':
        model = MultinomialNB()
    else:
        logger.critical(
            'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
        )
        model = MultinomialNB()
        # Keep the label in sync with the model actually trained, so the
        # saved artefact and the plot title are not filed under the unknown
        # name.
        algo = 'mnb'

    model.fit(x_train, y_train)
    save_model(model, version='v1', algo=algo)

    y_pred = model.predict(x_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)

    title = f'Learning Curves ({algo.upper()})'
    learning_curve.plot(model, x_test, y_test, title=title)
def cnb(X_train, Y_train, X_test, Y_test):
    """Fit ComplementNB and return its test accuracy in percent.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels.

    Returns:
        Percentage of test samples classified correctly, for any number of
        classes.
    """
    classifier = ComplementNB()
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(Y_test, y_pred)

    # Correct predictions sit on the diagonal.  The trace works for any
    # class count; only 2 and 3 classes were handled before, and any other
    # count failed with an unbound variable.
    total_correct_predictions = np.trace(cm)
    total_predictions_made = np.sum(cm)
    accuracy = total_correct_predictions / total_predictions_made * 100
    return accuracy