def _complementnb(*,
                  train,
                  test,
                  x_predict=None,
                  metrics,
                  alpha=1.0,
                  fit_prior=True,
                  class_prior=None,
                  norm=False):
    """For for info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB
    """

    model = ComplementNB(alpha=alpha,
                         fit_prior=fit_prior,
                         class_prior=class_prior,
                         norm=norm)
    model.fit(train[0], train[1])
    model_name = 'ComplementNB'
    y_hat = model.predict(test[0])

    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    if metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    if metrics == 'accuracy_score':
        accuracy = accuracy_score(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
示例#2
0
class CNBTwoStepClassifier(ImbalancedTrainerInterface):
    """Two-stage ComplementNB classifier.

    ``clf_yn`` is trained on every sample, with the labels of one partition
    replaced by the placeholder ``'not'``. ``clf_n`` is trained on that
    partition only and is consulted whenever ``clf_yn`` answers ``'not'``.
    The partitioning itself comes from ``_find_dominant_class`` and
    ``_partition``, which are not defined here (presumably inherited from
    ``ImbalancedTrainerInterface`` -- verify).
    """

    def __init__(self, alpha=1):
        self.alpha = alpha
        self.clf_yn = ComplementNB(alpha=alpha)
        self.clf_n = ComplementNB(alpha=alpha)

    def fit(self, X_train, y_train):
        """Train both stages; ``X_train`` must provide ``toarray()``."""
        rows = X_train.toarray().tolist()
        labels = pd.Series(y_train)
        dominant = self._find_dominant_class(rows, labels)
        x_w, x_o, y_w, y_o = self._partition(rows, labels, dominant[0])

        # Stage one sees all rows; the second partition is relabelled 'not'.
        stage_one_rows = x_w + x_o
        stage_one_labels = y_w + ['not'] * len(y_o)
        self.clf_yn.fit(stage_one_rows, stage_one_labels)

        # Stage two only learns to tell the second partition's labels apart.
        self.clf_n.fit(x_o, y_o)

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        first_pass = self.clf_yn.predict(X_test)
        return [
            self.clf_n.predict(row)[0] if label == 'not' else label
            for label, row in zip(first_pass, X_test)
        ]

    def score(self, X_test, y_test):
        """Return the fraction of predictions equal to ``y_test``."""
        expected = pd.Series(y_test)
        predicted = self.predict(X_test)
        return np.mean(predicted == expected)
示例#3
0
def main(args):
    """Train and validate a ComplementNB model, or predict with a saved one.

    Attributes read from ``args``: ``model_name``, ``root``, ``inFile``,
    ``outFileName``, ``train``; when training also ``ratio``, ``alpha``,
    ``fit_prior`` and ``norm``; when predicting also ``model_path``.

    The input pickle must contain a ``'data'`` entry and, for training, a
    ``'target'`` entry. Results are written to
    ``<root>/data/<outFileName>.csv``.
    """
    model_name = args.model_name
    model_dir = os.path.join(args.root, "model")  # get model dir
    data_dir = os.path.join(args.root, "data")  # get data dir

    data_path = os.path.join(data_dir, args.inFile)
    print('load data from' + data_path)

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager closes the handle, which the bare open() never did.
    with open(data_path, 'rb') as data_file:
        data = pickle.load(data_file)
    out_path = os.path.join(data_dir, args.outFileName + '.csv')
    assert 'data' in data
    if args.train:
        ratio = args.ratio
        clf = ComplementNB(alpha=args.alpha,
                           fit_prior=args.fit_prior,
                           norm=args.norm)

        assert 'target' in data

        features = data['data']
        labels = data['target']

        # Single random train/validation split; ``ratio`` is the held-out share.
        rs = ShuffleSplit(n_splits=1, test_size=ratio)
        train_index, val_index = next(rs.split(features, labels))

        x_train = features[train_index]
        x_test = features[val_index]

        y_train = labels[train_index]
        y_test = labels[val_index]

        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # The accuracy
        print('Accuracy: \n', accuracy_score(y_test, y_pred))

        df = pd.DataFrame({
            'pred': y_pred,
            'target': y_test,
        })
        print(f'validation results save to:{args.outFileName}.csv')
        df.to_csv(out_path)
        print("Some results of validation:")
        print(df.head())

        # NOTE(review): ``model`` is not defined anywhere in this function; it
        # must be a module-level name or this line raises NameError -- confirm
        # the intended file name.
        model_path = os.path.join(model_dir, f'{model_name}_{model}.model')
        dump(clf, model_path)
    else:
        # TODO: How to Save the prediction?
        model_path = os.path.join(model_dir, args.model_path)
        # Load from the path resolved above; the original computed it and then
        # ignored it in favour of ``args.model``.
        clf = load(model_path)
        x = data['data']
        pred = clf.predict(x)
        df = pd.DataFrame({
            'pred': pred,
        })
        df.to_csv(out_path)
class NaiveBayes():
    """TF-IDF + ComplementNB author classifier trained on tab-separated data.

    The constructor reads ``../data/<division>_{train,val,test,spurious}.csv``,
    fits the vectorizer and label encoder on the training split and trains
    the classifier immediately.
    """

    def __init__(self, division="sents", ngram=1):

        def read_split(split, columns):
            # All splits are header-less TSV files with the given columns.
            return pd.read_csv(f"../data/{division}_{split}.csv",
                               sep='\t',
                               names=list(columns))

        labelled_columns = ('sentence', 'author', 'work')
        self.df_train = read_split("train", labelled_columns)
        self.df_val = read_split("val", labelled_columns)
        self.df_test = read_split("test", labelled_columns)

        # Rows with work id 36 are kept apart as ``df_epistles``.
        spurious = read_split("spurious", ('sentence', 'work'))
        self.df_spurious = spurious
        self.df_epistles = self.df_spurious[self.df_spurious['work'] == 36]
        self.df_spurious = self.df_spurious[self.df_spurious['work'] != 36]

        stop_words = [strip_accents(word) for word in STOPS_LIST]
        self.tfidf = TfidfVectorizer(lowercase=False,
                                     stop_words=stop_words,
                                     ngram_range=(1, ngram))

        # Vocabulary comes from the training split only.
        self.tfidf_train = self.tfidf.fit_transform(self.df_train['sentence'])
        self.tfidf_val = self.tfidf.transform(self.df_val['sentence'])
        self.tfidf_test = self.tfidf.transform(self.df_test['sentence'])
        self.tfidf_spurious = self.tfidf.transform(
            self.df_spurious['sentence'])
        self.tfidf_epistles = self.tfidf.transform(
            self.df_epistles['sentence'])

        self.label = LabelEncoder()
        self.author_train = self.label.fit_transform(self.df_train['author'])
        self.author_val = self.label.transform(self.df_val['author'])
        self.author_test = self.label.transform(self.df_test['author'])

        self.nb = ComplementNB()
        self.nb.fit(self.tfidf_train, self.author_train)

    def eval(self):
        """Print classification reports for the train and validation splits."""
        train_pred = self.nb.predict(self.tfidf_train)
        val_pred = self.nb.predict(self.tfidf_val)
        # The test split is predicted as well, but no report is printed for it.
        self.nb.predict(self.tfidf_test)

        print(classification_report(self.author_train, train_pred))
        print(classification_report(self.author_val, val_pred))

    def predict(self):
        """Print the share of "Plato" predictions for epistles and spurious."""

        def attribute(matrix):
            # Predicted author names (decoded from the encoded classes).
            return self.label.inverse_transform(self.nb.predict(matrix))

        epistles = attribute(self.tfidf_epistles)
        print((epistles == "Plato").mean())
        print(epistles)

        spurious = attribute(self.tfidf_spurious)
        print((spurious == "Plato").mean())
def train_complement_naivebayes(params,
                                x_train,
                                y_train,
                                n_folds,
                                random_state,
                                stratified=True,
                                shuffle=True):
    """Cross-validate a ComplementNB model and return the fitted estimator.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``ComplementNB``.
    x_train, y_train
        Training samples and labels; both are indexed with ``.iloc``.
    n_folds : int
        Number of cross-validation folds.
    random_state
        Seed for the fold shuffling. Ignored when ``shuffle`` is false.
    stratified : bool
        Use ``StratifiedKFold`` when true, ``KFold`` otherwise.
    shuffle : bool
        Whether to shuffle before splitting.

    Returns
    -------
    The single ``ComplementNB`` instance, as fitted on the training part of
    the last fold (the same object is re-fitted on every fold).

    Notes
    -----
    Precision and recall are computed with the scikit-learn defaults.
    """
    # Model and hyperparameter selection
    splitter_cls = StratifiedKFold if stratified else KFold
    # scikit-learn raises ValueError when a random_state is supplied together
    # with shuffle=False, so only forward the seed when it has an effect.
    kf = splitter_cls(n_splits=n_folds,
                      random_state=random_state if shuffle else None,
                      shuffle=shuffle)

    cnb_model = ComplementNB(**params)

    # Model Training
    for fold, (train_index, test_index) in enumerate(
            kf.split(x_train, y_train), start=1):
        print('\n Fold %d' % fold)

        x_train_cv, x_val_cv = x_train.iloc[train_index], x_train.iloc[
            test_index]
        y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[
            test_index]

        cnb_model.fit(x_train_cv, y_train_cv)

        # Predictions on the fold's own training part and on its held-out part
        scores_cv = cnb_model.predict(x_train_cv)
        scores_val = cnb_model.predict(x_val_cv)

        # training evaluation
        train_pc = accuracy_score(y_train_cv, scores_cv)
        train_pp = precision_score(y_train_cv, scores_cv)
        train_re = recall_score(y_train_cv, scores_cv)
        print('\n train-Accuracy: %.6f' % train_pc)
        print(' train-Precision: %.6f' % train_pp)
        print(' train-Recall: %.6f' % train_re)

        # validation evaluation
        eval_pc = accuracy_score(y_val_cv, scores_val)
        eval_pp = precision_score(y_val_cv, scores_val)
        eval_re = recall_score(y_val_cv, scores_val)
        print('\n eval-Accuracy: %.6f' % eval_pc)
        print(' eval-Precision: %.6f' % eval_pp)
        print(' eval-Recall: %.6f' % eval_re)

    # return model for evaluation and prediction
    return cnb_model
示例#6
0
class DocClfTfidfCNB():
    """Document classifier: TF-IDF features fed into ComplementNB.

    Vectorizer and smoothing settings (``maxdf``, ``mindf``, ``maxFeatures``,
    ``ngramrange``, ``alphasmooth``, ``minLength``) are read from names
    defined outside this class.
    """

    def __init__(self,maxStringLength=MAXSTRINGLENGH, \
                 firstStringLength=FIRSTSTRINGLENGTH):
        self.maxStringLength=maxStringLength
        self.firstStringLength=firstStringLength
        # Human-readable description of the configuration.
        self.message="Complement Naive Bayes using TF-IDF with "+"%5d" % maxFeatures + " features " + \
        " ngram-range "+"%2d" % ngramrange[0]+" to "+"%2d" % ngramrange[1] + \
        " maxString Length "+ "%6d" % self.maxStringLength

    def preprocess(self,x):
        """Return ``(truncated, beginnings)`` for the documents in ``x``.

        ``truncated`` holds each item cut to ``maxStringLength`` and
        ``beginnings`` each item cut to ``firstStringLength``.
        """
        xprocessed=[item[0:self.maxStringLength] for item in x]
        xbegin=[item[0:self.firstStringLength] for item in x]
        return xprocessed,xbegin

    def fit(self,x,y):
        """Fit vectorizer and classifier; return predictions on ``x`` itself."""
        xprocessed,_=self.preprocess(x)
        self.vectorizer=TfidfVectorizer(max_df=maxdf,min_df=mindf,
                                        max_features=maxFeatures,
                                        ngram_range=ngramrange)
        xv=self.vectorizer.fit_transform(xprocessed)
        self.nbclf=ComplementNB(alpha=alphasmooth)
        self.nbclf.fit(xv,y)
        ytrain=self.nbclf.predict(xv)
        return ytrain

    def predict(self,x):
        """Predict labels for a group of documents.

        Returns ``["No input"]`` when the first document is shorter than
        ``minLength``. Any error from the vectorizer or classifier propagates
        to the caller.
        """
        if (len(x[0])<minLength):
            return ["No input"]
        xprocessed,_=self.preprocess(x)
        xv=self.vectorizer.transform(xprocessed)
        return self.nbclf.predict(xv)

    def confidence(self,ytest,ytestpred):
        """Store a per-label confidence table and return the confusion matrix.

        The confidence of a label is the matrix diagonal divided by the column
        sum plus 0.1, i.e. the share of predictions of that label that were
        correct; the 0.1 avoids division by zero.

        NOTE: the table is stored in ``self.confidence``, which replaces this
        method on the instance, so it can be called only once per instance.
        Kept as is because external code may read that attribute.
        """
        conf_mat = confusion_matrix(ytest, ytestpred)
        # confusion_matrix orders rows/columns by the sorted union of true and
        # predicted labels; using only ytest misaligns the table whenever a
        # label is predicted that never occurs in ytest.
        labels = sorted(set(ytest) | set(ytestpred))
        self.confidence=dict(zip(labels, conf_mat.diagonal()/
                                 (.1+conf_mat.sum(axis=0))))
        return conf_mat

    def getConfidence(self,x,y):
        """Return the stored confidence for label ``y``, or -1.0 if unknown.

        ``x`` is accepted but not used.
        """
        try:
            return self.confidence[y]
        except (KeyError, TypeError):
            # KeyError: label not in the table. TypeError: ``confidence()``
            # has not been called yet (attribute is still the method) or the
            # label is unhashable.
            return -1.0
def tune_cnb(params):
    """Objective function: 1 - mean ROC AUC of ComplementNB over seeded splits.

    ``params[0]`` is the ``alpha`` to evaluate. The CSV path, the number of
    seeds and the train share are read from the module-level ``args``
    (``args.file``, ``args.seeds``, ``args.train_size``). ``args.seeds`` of
    ``'10'`` or ``'100'`` selects that many seeds; anything else runs a single
    seed.

    Prints a message and exits when the train share is outside [0, 1].
    """
    alpha_ = params[0]

    csv_path = args.file

    if args.seeds == '10':
        seeds = [(i + 1) * 100 for i in range(10)]
    elif args.seeds == '100':
        seeds = [(i + 1) * 100 for i in range(100)]
    else:
        seeds = [100]

    train_size = float(args.train_size)
    if train_size > 1 or train_size < 0:
        print('Train size invalid. Please enter a value between 0 and 1.')
        exit()

    # The file does not change between seeds, so read it once.
    df = pd.read_csv(csv_path)

    total_auc = 0.0

    for seed in seeds:
        clf = ComplementNB(alpha=alpha_)

        # Create the train/test split from this seed. The original drew from
        # the global, unseeded generator, so ``seed`` had no effect.
        rng = np.random.RandomState(seed)
        df['is_train'] = rng.uniform(0, 1, len(df)) <= train_size
        train, test = df[df['is_train']], df[~df['is_train']]

        # Features: every column except the first, the last original column
        # and the ``is_train`` marker added above.
        features = df.columns[1:-2]

        clf.fit(train[features], train['Security'])
        preds = clf.predict(test[features])

        total_auc = total_auc + roc_auc_score(test['Security'], preds)

    # The optimizer minimizes, so return the complement of the mean AUC.
    # Average over the runs actually performed, not over int(args.seeds).
    return 1 - total_auc / len(seeds)
def ComplementNB_classification(train,
                                test,
                                train_labels,
                                test_labels,
                                res=None):
    """Train a default ComplementNB, report on the test set and score it.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that receives a ``"ComplementNB"`` entry with
        the model, its accuracy and its name; a fresh dict is used when
        omitted
    :return: ``(score, model)`` -- the test accuracy and the fitted classifier
    """
    # A ``{}`` default would be shared (and keep growing) across calls.
    if res is None:
        res = {}

    print("Classifying with Complement Nive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
示例#9
0
class ComplementNBImpl():
    """Thin wrapper that builds an ``SKLModel`` from stored hyperparameters."""

    def __init__(self,
                 alpha=1.0,
                 fit_prior=True,
                 class_prior=None,
                 norm=False):
        # Kept as a dict so it can be splatted into the model constructor.
        self._hyperparams = dict(alpha=alpha,
                                 fit_prior=fit_prior,
                                 class_prior=class_prior,
                                 norm=norm)

    def fit(self, X, y=None):
        """Create a fresh model, fit it and return ``self``.

        ``y`` is only forwarded to the model's ``fit`` when it is not None.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        fitted = self._sklearn_model
        return fitted.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        fitted = self._sklearn_model
        return fitted.predict_proba(X)
示例#10
0
def get_optimal_values_ComplementNB(x_train, y_train, x_val, y_val):
    """Grid-search ComplementNB settings by validation accuracy.

    Tries alpha in 0.0, 0.1, ..., 1.0 combined with both values of
    ``fit_prior`` and ``norm``. Prints and returns
    ``(best_score, alpha, fit_prior, norm)`` with the score in percent. On
    ties the first combination tried wins; if no combination scores above 0
    the initial values (1.0, True, False) are returned.
    """
    best_score = 0
    best_alpha, best_fit_prior, best_norm = 1.0, True, False

    # Evaluate every combination to pick the best parameters
    for alpha in [x / 10 for x in range(0, 11)]:
        for fit_prior in (True, False):
            for norm in (True, False):
                candidate = ComplementNB(alpha=alpha,
                                         fit_prior=fit_prior,
                                         norm=norm)
                candidate.fit(x_train, y_train)
                score = accuracy_score(y_val, candidate.predict(x_val)) * 100
                if best_score < score:
                    best_score = score
                    best_alpha, best_fit_prior, best_norm = (alpha, fit_prior,
                                                             norm)

    print(best_score, best_alpha, best_fit_prior, best_norm)
    return best_score, best_alpha, best_fit_prior, best_norm
示例#11
0
class bayes(object):
    """Naive Bayes model selected by name and fitted on construction.

    ``algorithm`` may be ``'GNB'``, ``'MNB'`` or ``'BNB'``; any other value
    selects ComplementNB.
    """

    def __init__(self, data, target, algorithm="GNB"):
        self.algorithm = algorithm
        self.data = data
        self.target = target

        if algorithm == 'GNB':
            estimator = GaussianNB()
        elif algorithm == 'MNB':
            estimator = MultinomialNB()
        elif algorithm == 'BNB':
            estimator = BernoulliNB()
        else:
            estimator = ComplementNB()
        self.model = estimator

        # Training happens right away, so the instance is ready to predict.
        self.model.fit(data, target)

    def save_model(self, path):
        """Dump the current model to ``path``."""
        _joblib.dump(self.model, path)

    def load_model(self, path):
        """Replace the current model with the one stored at ``path``."""
        self.model = _joblib.load(path)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)
def run_compnb(x_train, x_test, y_train, y_test, x):
    '''Fit a default ComplementNB, record its results and return predictions.

    The outcome of ``get_model_results`` is stored under ``'compnb'`` in the
    module-level ``model_dict``; ``x`` is only forwarded to that helper.
    '''
    logger.info("Running ComplementNB")
    classifier = ComplementNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    results = get_model_results(classifier, x_test, y_test, predictions, x)
    model_dict['compnb'] = results
    return predictions
def run_Complement_Naive_Bayes(X_train, y_train, X_test, is_norm=False):
    """Train ComplementNB and return test predictions as a uint64 array.

    ``is_norm`` is forwarded as the classifier's ``norm`` option.
    """
    from sklearn.naive_bayes import ComplementNB

    print('Training model')
    model = ComplementNB(norm=is_norm)
    model = model.fit(X_train, y_train)

    print('Predicting on test data')
    # The predicted labels are cast to unsigned 64-bit integers.
    return np.asarray(model.predict(X_test), dtype=np.uint64)
def CNB(train_x, train_y, test_x, test_y):  # Print ComplementNB results
    """Fit a default ComplementNB and print test predictions and accuracy.

    Predictions are shown 12 per row, which gives the 10 x 12 grid for a
    120-sample test set. When the number of predictions is not a non-zero
    multiple of 12 they are printed as a flat array instead of raising.
    """
    cnb = ComplementNB()
    cnb.fit(train_x, train_y)
    pre_arr = cnb.predict(test_x)
    # A fixed reshape(10, 12) raises ValueError for any other test-set size.
    if pre_arr.size and pre_arr.size % 12 == 0:
        pre_arr = pre_arr.reshape(-1, 12)

    print('ComplementNB의 테스트 세트 예측 :\n{}'.format(pre_arr))
    print('ComplementNB의 테스트 세트 정확도 : {0:0.2f}%'.format(
        cnb.score(test_x, test_y) * 100))
    print('------------------------------------------------------')
示例#15
0
def complement_bayes(train_data, test_data):
    """Fit ComplementNB on ``train_data`` and evaluate it on ``test_data``.

    The label is the ``'state'`` column; features are the columns selected
    positionally by the module-level ``FEATURES_INDICES``. The fitted model
    and its predictions are handed to ``evaluate``; nothing is returned.
    """
    train_y, train_X = train_data['state'], train_data.iloc[:, FEATURES_INDICES]
    test_y, test_X = test_data['state'], test_data.iloc[:, FEATURES_INDICES]

    classifier = ComplementNB()
    classifier.fit(train_X, train_y)
    predicted = classifier.predict(test_X)
    evaluate(classifier, test_X, test_y, predicted)
示例#16
0
def complementNB(tr_vec, tr_ans, val_vec, val_ans, te_vec):
    """Fit ComplementNB, print the validation score and label the test set.

    Returns an int array that is 1 where the prediction for ``te_vec``
    exceeds 0.35 and 0 elsewhere.
    """
    from sklearn.naive_bayes import ComplementNB

    model = ComplementNB()
    model.fit(tr_vec, tr_ans)
    print(model.score(val_vec, val_ans))

    print('make predictions ...')
    # NOTE(review): the 0.35 cutoff is applied to the output of predict(),
    # not predict_proba() -- confirm this is intended.
    test_predictions = model.predict(te_vec)
    return (test_predictions > 0.35).astype(int)
 def naive_bayes(self, name="Train_Test"):
     """Train ComplementNB on a 60/40 split of self.X / self.Y and log scores.

     The split is fixed by ``random_state=0``. The three values returned by
     ``self.nbeval`` are logged as F, P and R scores under the label ``name``.
     """
     split = train_test_split(self.X,
                              self.Y,
                              test_size=0.4,
                              random_state=0)
     X_train, X_test, y_train, y_test = split

     model = ComplementNB()
     model.fit(X_train, y_train)
     predicted = model.predict(X_test)

     f, p, r = self.nbeval(y_test, predicted)
     template = "{}: F score:{:.3f}\tP score:{:.3f}\tR score:{:.3f}."
     self.logger.info(template.format(name, f, p, r))
示例#18
0
def naive_bayes(x, y):
    """Fit ComplementNB and MultinomialNB on ``(x, y)`` and predict ``x``.

    Returns ``(complement_predictions, multinomial_predictions)``. Both models
    predict on the same data they were trained on.
    """
    complement, multinomial = ComplementNB(), MultinomialNB()

    # Train both models on the full dataset.
    for model in (complement, multinomial):
        model.fit(x, y)

    # Predict on the training data itself.
    return complement.predict(x), multinomial.predict(x)
示例#19
0
def get_accuracy_of_selection(X, y):
    """Return ``(mean, std)`` of ComplementNB accuracy over 25 stratified folds.

    For every fold the class prior passed to the classifier is the observed
    frequency of labels 0 and 1 in that fold's training part; labels are
    converted with ``int()`` and used as list indices.
    """
    # Shuffled 25-fold stratified cross-validation (unseeded)
    folds = StratifiedKFold(n_splits=25, shuffle=True, random_state=None)

    # One accuracy value per fold for this selection of features
    fold_accuracies = []

    for train_index, test_index in folds.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Count how often each of the two classes occurs in the training part
        counts = [0, 0]
        for label in y_train:
            counts[int(label)] += 1

        # Turn the counts into prior probabilities
        n_observations = counts[0] + counts[1]
        priors = [counts[0] / n_observations, counts[1] / n_observations]

        # Fit a complement naive Bayes model with those priors
        classifier = ComplementNB(class_prior=priors)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        # Share of correctly predicted test samples
        n_correct = (y_test == y_pred).sum()
        fold_accuracies.append(n_correct / y_test.size)

    # Summarise the folds
    return np.mean(fold_accuracies), np.std(fold_accuracies)
示例#20
0
class NBClassifier(super.abstract_classifier):
    """ComplementNB-backed classifier holding its own training data.

    NOTE(review): the base class is looked up on the name ``super``;
    presumably a project module of that name shadows the builtin -- verify.
    """

    def __init__(self, train_features, train_labels):
        self.train_features, self.train_labels = train_features, train_labels
        self.nb_Member = ComplementNB()

    def train(self):
        """Fit the classifier on the stored training data."""
        # After this call the ComplementNB member is ready to classify.
        features, labels = self.train_features, self.train_labels
        self.nb_Member.fit(features, labels)

    def classify(self, newVector):
        """Return the classifier's prediction for ``newVector``."""
        prediction = self.nb_Member.predict(newVector)
        return prediction
    def guassian_distribution_classifier(self):
        """Train ComplementNB (alpha=1.590) and write the test predictions.

        Despite the method name, the model used is ComplementNB. Predictions
        are decoded with ``self.Encoder``, paired with ids 0..len(test_data)-1
        and passed to ``self._csv_output_generator`` together with the
        module-level ``final_output_path``.
        """
        train_x, test_x, train_y = self._sklearn_data_cleaning()

        classifier = ComplementNB(alpha=1.590)
        classifier.fit(train_x, train_y)
        # Predict the labels of the test set
        predicted = classifier.predict(test_x)

        ids = list(range(0, len(self.test_data)))
        categories = self.Encoder.inverse_transform(predicted)
        rows = list(zip(categories, ids))
        output_frame = pd.DataFrame(rows, columns=['Category', 'Id'])
        self._csv_output_generator(output_frame, final_output_path)
def complement_bayes(x_train,x_test,y_train,y_test,X,fl,amostra_paci3,fl_a3,nome):
    """Fit ComplementNB, score a patient sample and print evaluation metrics.

    ``amostra_paci3`` is modified in place: it gains the columns ``result``
    (predicted class), ``probls`` (probability of the second class) and
    ``fl_severidade`` (``fl_a3``), and is then written to
    ``modelo_complement_bayes.csv``.

    Training and validation AUC-ROC are printed, followed by a crosstab, a
    classification report and the AUC of the predictions on ``X`` against
    ``fl``. The ROC curve is drawn via ``plot_roc_curve`` with title ``nome``.
    """
    Complement=ComplementNB()
    Complement.fit(x_train,y_train)
    pred=Complement.predict_proba(x_train)

    # Score the sample before the result columns are added to it.
    amostra_=Complement.predict_proba(amostra_paci3)
    amostra_2=Complement.predict(amostra_paci3)
    amostra_paci3['result']=amostra_2
    # predict_proba is 2-D (one column per class) and cannot be assigned to a
    # single column; keep the second-class probability, matching the
    # ``[:, 1]`` used for every AUC below.
    amostra_paci3['probls']=amostra_[:,1]
    amostra_paci3['fl_severidade']=fl_a3
    amostra_paci3.to_csv('modelo_complement_bayes.csv')

    print('Treinamento AUC-ROC:{}'.format(roc_auc_score(y_train,pred[:,1])))
    pred_2=Complement.predict_proba(x_test)
    print('Validacao AUC-ROC:{}'.format(roc_auc_score(y_test,pred_2[:,1])))

    yhat = Complement.predict_proba(X)[:, 1]
    # Predict once and reuse for both reports.
    predicted_labels = Complement.predict(X)
    print(pd.crosstab(fl, predicted_labels))
    print(classification_report(fl, predicted_labels))
    print('AUC: %0.2f' % roc_auc_score(fl,yhat))
    plot_roc_curve(fl,yhat,nome)
def parameter_iteration_tunning():
    """Print ComplementNB validation accuracy for a fixed list of alphas.

    Training and validation data are read through ``service().read_csv_data``
    from the ``dataset1/ds1`` CSV files.
    """
    loader = service()
    train_features, train_labels = loader.read_csv_data(
        'dataset1/ds1/ds1Train.csv')
    validation_features, validation_labels = loader.read_csv_data(
        'dataset1/ds1/ds1Val.csv')

    for alpha in (0.01, 0.001, 0.1, 1, 10, 100, 1000):
        classifier = ComplementNB(alpha=alpha)
        classifier.fit(train_features, train_labels)
        predicted = classifier.predict(validation_features)
        label = 'ComplementNB accuracy ' + str(alpha) + ' is'
        print(label, accuracy_score(validation_labels, predicted))
示例#24
0
def NB_accuracy_complement(X_train, X_test, y_train, y_test, fold):
    """Fit ComplementNB and print test metrics plus a cross-validation mean.

    Prints the classification report, confusion matrix and mean squared error
    on the test set, then the mean ``cross_val_score`` (in percent) on the
    training set with ``cv=fold``.
    """
    classifier = ComplementNB()
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    # NOTE(review): the accuracy is computed here but its value is not used.
    accuracy_score(y_test, predicted)
    print(classification_report(y_test, predicted))
    print(confusion_matrix(y_test, predicted))
    print("mean_squared_error: ", mean_squared_error(y_test, predicted))

    # The printed label says "5-fold" whatever ``fold`` is.
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=fold)
    print("After 5-fold: ", cv_scores.mean() * 100)
示例#25
0
def trainv2(std=False, algo='mnb', n=None):
    """Train, save and evaluate a Naive Bayes model (version ``v2``).

    Parameters
    ----------
    std : bool
        Standardize the train/test features via ``standardize``.
    algo : str
        ``'cnb'`` for ComplementNB or ``'mnb'`` for MultinomialNB. Any other
        value logs a critical message and falls back to MultinomialNB.
    n : int
        Forwarded to ``create_dataframe`` (select n samples from each
        category; default: all).

    Returns
    -------
    None
    """
    counts, df = process_dataframe(create_dataframe(n=n), algo=algo)
    ### Todo: Remove
    save_obj(df, 'v2_dataframe.p')
    save_obj(counts, 'v2_counts.p')
    ###

    # 70/30 split with a fixed seed
    split = train_test_split(counts,
                             df['label'],
                             test_size=0.3,
                             random_state=69)
    x_train, x_test, y_train, y_test = split
    if std:
        x_train, x_test = standardize(x_train, x_test)

    if algo == 'cnb':
        model = ComplementNB()
    else:
        if algo != 'mnb':
            logger.critical(
                'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
            )
        model = MultinomialNB()

    model.fit(x_train, y_train)
    # The model is saved under the requested ``algo`` name, even on fallback.
    save_model(model, version='v2', algo=algo)

    y_pred = model.predict(x_test)

    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)
    learning_curve.plot(model,
                        x_test,
                        y_test,
                        title=f'Learning Curves ({algo.upper()})')
示例#26
0
    def complement_nb(self, train, test):
        """Fit ComplementNB on ``train`` and print train and test evaluations.

        ``train`` and ``test`` are each split into features and labels with
        ``self.split_xy``. For both sets the confusion matrix, the
        classification report and the accuracy (rounded to 4 decimals) are
        printed.

        Returns the fitted model.
        """
        print('STARTING MULTINOMIAL NAIVE BAYES (COMPLEMENT)')
        trainX, trainy = self.split_xy(train)
        testX, testy = self.split_xy(test)

        model = ComplementNB()
        model.fit(trainX, trainy)
        train_pred = model.predict(trainX)
        test_pred = model.predict(testX)

        cm = confusion_matrix(trainy, train_pred)
        # Correct predictions sit on the diagonal. The trace works for any
        # number of classes; cm[0][0] + cm[1][1] is only right for two.
        acc = np.trace(cm) / (np.sum(cm))
        print('------train evaluation------')
        print(cm)
        print(classification_report(trainy, train_pred))
        print('TRAIN ACCURACY : {}\n'.format(np.round(acc, 4)))

        cm = confusion_matrix(testy, test_pred)
        acc = np.trace(cm) / (np.sum(cm))
        print('\n------test evaluation------')
        print(cm)
        print(classification_report(testy, test_pred))
        print('TEST ACCURACY : {}\n\n'.format(np.round(acc, 4)))

        return model
示例#27
0
    def test_model(self, X_test):
        """Fit three ComplementNB models on pickled matrices and predict.

        The author, title and text matrices are loaded from ``resources/``.
        Predictions are stored on ``self.author_predict``,
        ``self.title_predict`` (both as float64 arrays) and
        ``self.text_predict``.

        NOTE(review): ``y_train`` is not defined in this method; presumably a
        module-level name -- verify.
        """

        def load_matrix(filename):
            # NOTE: unpickling can execute arbitrary code; trusted files only.
            with open(os.path.join("resources", filename), "rb") as handle:
                return pickle.load(handle)

        text_matrix = load_matrix("X_text_matrix.pkl")
        title_matrix = load_matrix("X_title_matrix.pkl")
        author_matrix = load_matrix("X_author_matrix.pkl")
        print(author_matrix[:5])

        author_clf = ComplementNB().fit(author_matrix, y_train)
        title_clf = ComplementNB().fit(title_matrix, y_train)
        text_clf = ComplementNB().fit(text_matrix, y_train)
        print(text_clf)

        # vectorize() runs first; title_vector / text_vector are read after it.
        author_input = self.vectorize(X_test)
        author_predict = np.asarray(author_clf.predict(author_input),
                                    dtype=np.float64,
                                    order='C')

        title_predict = np.asarray(title_clf.predict(self.title_vector),
                                   dtype=np.float64,
                                   order='C')
        self.title_predict = title_predict

        # Text predictions are stored as returned, without the float cast.
        text_predict = text_clf.predict(self.text_vector)

        self.author_predict = author_predict
        self.text_predict = text_predict
        return
示例#28
0
class GenderClassifier:
    """Predict 'M' / 'F' from a name using character-affix features.

    Names are turned into prefix/suffix features, one-hot encoded with a
    ``DictVectorizer`` and classified with ``ComplementNB``. Gender labels are
    stored as the strings ``'0'`` (M) and ``'1'`` (F).
    """

    def __init__(self):
        self._classifier = ComplementNB()
        self._vectorizer = DictVectorizer()

    def _getFeatures(self, name):
        """Return the feature dict (first 1-3 and last 1-4 letters) of a name.

        Slices are used throughout so an empty name yields empty strings
        instead of raising IndexError.
        """
        name = name.lower()
        return {
            "firstL": name[:1],
            "first2L": name[:2],
            "first3L": name[:3],
            "lastL": name[-1:],
            "last2": name[-2:],
            "last3": name[-3:],
            "last4": name[-4:],
        }

    def preprocess(self, df):
        """Return ``(X, y)`` built from the ``name`` and ``gender`` columns.

        Rows are shuffled with a fixed seed. ``X`` is a Series of feature
        dicts, ``y`` a Series where 'M' became '0' and 'F' became '1'. The
        caller's frame is not modified.
        """
        # shuffle dataset
        df = df.sample(frac=1, random_state=10).reset_index(drop=True)

        # Non-inplace replace: an inplace replace on a selected column is
        # unreliable under pandas copy-on-write.
        y = df['gender'].replace(['M', 'F'], ['0', '1'])
        X = df['name'].apply(self._getFeatures)
        return X, y

    def train(self):
        """Fit the vectorizer and the classifier on the stored dataset."""
        self._vectorizer.fit(self._X)
        self._classifier.fit(self._vectorizer.transform(self._X), self._y)

    def predict(self, name):
        """Return 'F' or 'M' for a single name."""
        transformed = self._vectorizer.transform(self._getFeatures(name))
        predicted = self._classifier.predict(transformed)
        # Take the single prediction explicitly; int() on a whole array is
        # deprecated in NumPy.
        if int(predicted[0]):
            return 'F'
        return 'M'

    # preprocess & train classifier
    def setup(self, df):
        """Preprocess ``df``, store it as the dataset and train on it."""
        self._X, self._y = self.preprocess(df)
        self.train()

    # append provided name and gender to current dataset
    def add_new(self, df):
        """Append the rows of ``df`` to the stored dataset (no retraining)."""
        # Local import keeps this class self-contained; Series.append was
        # removed in pandas 2.0, concat is the supported replacement.
        import pandas as pd

        add_x, add_y = self.preprocess(df)
        self._X = pd.concat([self._X, add_x], ignore_index=True)
        self._y = pd.concat([self._y, add_y], ignore_index=True)
示例#29
0
def train(std=False, algo='mnb'):
    """Train, save and evaluate a Naive Bayes model (version ``v1``).

    Parameters
    ----------
    std : bool
        Standardize the train/test features via ``standardize``.
    algo : str
        ``'cnb'`` for ComplementNB or ``'mnb'`` for MultinomialNB. Any other
        value logs a critical message and falls back to MultinomialNB.

    Returns
    -------
    None
    """
    features, labels = make_dataset(make_dictionary())
    ### Todo: Remove
    save_obj(features, 'v1_features.p')
    save_obj(labels, 'v1_labels.p')
    ###

    # 70/30 split with a fixed seed
    split = train_test_split(features,
                             labels,
                             test_size=0.3,
                             random_state=69)
    x_train, x_test, y_train, y_test = split

    if std:
        x_train, x_test = standardize(x_train, x_test)

    if algo == 'cnb':
        model = ComplementNB()
    else:
        if algo != 'mnb':
            logger.critical(
                'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
            )
        model = MultinomialNB()

    model.fit(x_train, y_train)
    # The model is saved under the requested ``algo`` name, even on fallback.
    save_model(model, version='v1', algo=algo)

    y_pred = model.predict(x_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)
    learning_curve.plot(model,
                        x_test,
                        y_test,
                        title=f'Learning Curves ({algo.upper()})')
示例#30
0
 def cnb(X_train, Y_train, X_test, Y_test):
     """Fit a default ComplementNB and return its test accuracy in percent.

     The accuracy is the confusion-matrix diagonal (correct predictions)
     divided by the total number of predictions, times 100.
     """
     ##################### CNB ######################
     classifier = ComplementNB()
     classifier.fit(X_train, Y_train)
     y_pred = classifier.predict(X_test)
     # Making the Confusion Matrix
     from sklearn.metrics import confusion_matrix
     cm = confusion_matrix(Y_test, y_pred)
     # The trace counts correct predictions for any number of classes; the
     # original only handled 2 or 3 and left the total unbound otherwise.
     total_correct_predictions = np.trace(cm)
     total_predictions_made = np.sum(cm)
     accuracy = total_correct_predictions / total_predictions_made * 100
     return accuracy