Example #1
    def __init__(self, MODEL, train_x, train_y, test_x, test_y):

        #---data---#
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.test_y = test_y

        #---model---#
        self.cross_validate = False
        self.MODEL = MODEL

        if self.MODEL == 'NEWS':
            self.models = {
                'Gaussian': GaussianNB(),
                'Multinomial': MultinomialNB(alpha=0.065),
                'Complement': ComplementNB(alpha=0.136),
                'Bernoulli': BernoulliNB(alpha=0.002)
            }
        elif self.MODEL == 'MUSHROOM':
            self.models = {
                'Gaussian': GaussianNB(),
                'Multinomial': MultinomialNB(alpha=0.0001),
                'Complement': ComplementNB(alpha=0.0001),
                'Bernoulli': BernoulliNB(alpha=0.0001)
            }
        elif self.MODEL == 'INCOME':
            self.cross_validate = True
            self.models = {
                'Gaussian': GaussianNB(),
                'Multinomial': MultinomialNB(alpha=0.959),
                'Complement': ComplementNB(alpha=0.16),
                'Bernoulli': BernoulliNB(alpha=0.001)
            }
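A minimal companion sketch (not part of the original class) showing how the models dict above might be fitted and scored; it assumes train_x/test_x are nonnegative feature matrices, as MultinomialNB, ComplementNB, and BernoulliNB require, and dense if GaussianNB is included:

    def evaluate(self):
        # Fit each configured Naive Bayes variant and report held-out accuracy.
        for name, model in self.models.items():
            model.fit(self.train_x, self.train_y)
            print(name, model.score(self.test_x, self.test_y))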
Example #2
def build_pipeline(train, dev, clf):
    x_train, y_train = prepare_set(train)
    x_dev, y_dev = prepare_set(dev)  # not used below; evaluation happens at the call site

    bow = CountVectorizer(max_features=2000)
    tfidf = TfidfTransformer()

    if clf == "SVM":
        clf = SVC(C=10, gamma=1, kernel="rbf")
    elif clf == "RF":
        clf = RandomForestClassifier(n_estimators=25, max_depth=35)
    elif clf == "NB":
        clf = ComplementNB(norm=False)
    elif clf == "R":
        clf = RecallBiasedEstimator([
            SVC(C=10, gamma=1, kernel="rbf"),
            RandomForestClassifier(n_estimators=10, max_depth=35),
            ComplementNB(norm=False)
        ])

    pipeline = Pipeline([('bow', bow), ('tfidf', tfidf), ('clf', clf)])

    pipeline.fit(x_train, y_train)

    return pipeline
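A hedged usage sketch; prepare_set() and the "R" RecallBiasedEstimator are project-specific helpers assumed from the snippet above, and train/dev stand for whatever structures prepare_set() accepts:

pipeline = build_pipeline(train, dev, clf="NB")
x_dev, y_dev = prepare_set(dev)  # the dev split is scored at the call site
print("dev accuracy:", pipeline.score(x_dev, y_dev))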
Example #3
    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model.
        As of right now we go with ComplementNB.
        """
        self._X_buff = []
        self._Y_buff = []
        self._predicted_buckets_buff = []
        self._real_buckets_buff = []
        self._original_real_buckets_buff = []
        self._original_predicted_buckets_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self._probabilistic_model = MultinomialNB(
                alpha=self._smoothing_factor)

            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]

            if len(self.buckets) < 3:
                self._probabilistic_model = ComplementNB(
                    alpha=self._smoothing_factor)
        else:
            self._probabilistic_model = ComplementNB(
                alpha=self._smoothing_factor)

            self.buckets = None

        self.data_type = col_stats['data_type']

        self.bucket_accuracy = {}
Example #4
def predict_classifier(name_dataset, name_train, classifier, name_test,
                       metric):
    """Run classifier"""
    if classifier == "ada_boost":
        estimator = AdaBoostClassifier(random_state=42,
                                       base_estimator=ComplementNB(alpha=0.01))
        #estimator = AdaBoostClassifier(random_state=42, base_estimator= LogisticRegression(C= 50, max_iter= 100))

    elif classifier == "extra_tree":
        estimator = ExtraTreesClassifier(random_state=SEED)

    elif classifier == "knn":
        estimator = KNeighborsClassifier()

    elif classifier == "logistic_regression":
        estimator = LogisticRegression(random_state=SEED)

    elif classifier == "naive_bayes":
        estimator = MultinomialNB()

    elif classifier == "naive_bayes_complement":
        estimator = ComplementNB()

    elif classifier == "passive_aggressive":
        estimator = PassiveAggressiveClassifier(random_state=SEED,
                                                max_iter=1000)

    elif classifier == "random_forest":
        estimator = RandomForestClassifier(random_state=SEED)

    elif classifier == "sgd":
        estimator = SGDClassifier(random_state=SEED, max_iter=1000)

    elif classifier == "svm":
        estimator = svm.LinearSVC(random_state=SEED, max_iter=1000)

    x_train, y_train, x_test, y_test = load_svmlight_files(
        [open(name_train, 'rb'), open(name_test, 'rb')])

    load_estimator = False
    if load_estimator:
        estimator = joblib.load("escores/grid_" + name_dataset + "_" +
                                classifier)  # load a previously fitted estimator
    else:
        if len(classifier.split(",")) == 1:  # a single classifier name
            escores = cv.load_escores(name_dataset, classifier,
                                      1)  # test score 0
            best_param_folds = cv.best_param_folds_no_frequency(
                escores, 0, metric)  # best score per fold
            estimator.set_params(**best_param_folds)
        estimator.fit(x_train, y_train)

    y_pred = estimator.predict(x_test)
    cv.save_dict_list([y_test], [y_pred],
                      'y_pred/' + name_dataset + "_" + classifier + "_" +
                      metric + "_" + cv.name_file(name_test))
Example #5
def stacking_ensemble(X, y):
    cnb1 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', ComplementNB(alpha=0.347))])
    cnb2 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', ComplementNB(alpha=0.347))])
    rf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf',
         RandomForestClassifier(n_estimators=1000, max_depth=14, n_jobs=-1)),
    ])
    knn = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier(n_neighbors=100, n_jobs=-1)),
    ])
    xgb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf',
         XGBClassifier(objective='multi:softmax', num_class=20, n_jobs=-1)),
    ])
    lr = Pipeline([  # defined but not included in the stacking ensemble below
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf',
         LogisticRegression(solver='lbfgs',
                            max_iter=1000,
                            multi_class='multinomial')),
    ])
    lgbm = Pipeline([('vect',
                      TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
                     ('clf',
                      LGBMClassifier(objective='multiclass',
                                     reg_lambda=1e-6,
                                     num_leaves=150,
                                     n_estimators=200,
                                     learning_rate=0.07))])

    meta = OneVsRestClassifier(LinearSVC(class_weight='balanced'), n_jobs=-1)
    sclf = StackingClassifier(classifiers=[cnb1, cnb2, rf, knn, xgb, lgbm],
                              meta_classifier=meta,
                              use_probas=True)

    # ---------------------------- 4 Fold CV ---------------------------------
    scores = model_selection.cross_val_score(sclf,
                                             X,
                                             y,
                                             cv=4,
                                             scoring='accuracy',
                                             n_jobs=-1)
    print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
Example #6
def test_cnb():
    # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
    # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
    # https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
    X = np.array([[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1).
    Y = np.array([0, 0, 0, 1])

    # Check that weights are correct. See steps 4-6 in Table 4 of
    # Rennie et al. (2003).
    theta = np.array([[(0 + 1) / (3 + 6), (1 + 1) / (3 + 6), (1 + 1) / (3 + 6),
                       (0 + 1) / (3 + 6), (0 + 1) / (3 + 6),
                       (1 + 1) / (3 + 6)],
                      [(1 + 1) / (6 + 6), (3 + 1) / (6 + 6), (0 + 1) / (6 + 6),
                       (1 + 1) / (6 + 6), (1 + 1) / (6 + 6),
                       (0 + 1) / (6 + 6)]])

    weights = np.zeros(theta.shape)
    normed_weights = np.zeros(theta.shape)
    for i in range(2):
        weights[i] = -np.log(theta[i])
        normed_weights[i] = weights[i] / weights[i].sum()

    # Verify inputs are nonnegative.
    clf = ComplementNB(alpha=1.0)

    msg = re.escape('Negative values in data passed to ComplementNB (input X)')
    with pytest.raises(ValueError, match=msg):
        clf.fit(-X, Y)

    clf.fit(X, Y)

    # Check that counts/weights are correct.
    feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, feature_count)
    class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, class_count)
    feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, feature_all)
    assert_array_almost_equal(clf.feature_log_prob_, weights)

    clf = ComplementNB(alpha=1.0, norm=True)
    clf.fit(X, Y)
    assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
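The hard-coded fractions in theta follow directly from ComplementNB's rule: for each class, count features over the complement of that class and smooth by alpha. A short sketch re-deriving them:

import numpy as np

feature_count = np.array([[1, 3, 0, 1, 1, 0],   # class China (0)
                          [0, 1, 1, 0, 0, 1]])  # class Japan (1)
alpha, n_features = 1.0, 6
comp = feature_count.sum(axis=0) - feature_count  # counts outside each class
theta = (comp + alpha) / (comp.sum(axis=1, keepdims=True) + alpha * n_features)
# theta[0] == [1/9, 2/9, 2/9, 1/9, 1/9, 2/9]; weights are -log(theta), as asserted above.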
Example #7
def voting_ensemble(X, y):
    cnb1 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', ComplementNB(alpha=1.353))])
    cnb2 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', ComplementNB(alpha=0.347))])
    cnb3 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     # 'special' is not a scikit-learn ComplementNB parameter;
                     # these two variants rely on a project-local fork.
                     ('clf', ComplementNB(alpha=0.347, special=1))])
    cnb4 = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', ComplementNB(alpha=0.347, special=2))])
    svc = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer()),
        ('clf',
         OneVsRestClassifier(LinearSVC(class_weight='balanced'), n_jobs=-1)),
    ])
    lr = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                   ('clf',
                    LogisticRegression(solver='lbfgs',
                                       max_iter=1000,
                                       multi_class='multinomial'))])
    rf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf',
         RandomForestClassifier(n_estimators=1000,
                                max_depth=14,
                                verbose=1,
                                random_state=0,
                                n_jobs=-1)),
    ])
    knn = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier(n_neighbors=100, n_jobs=-1)),
    ])

    # add base classifiers to test synergies
    model = VotingClassifier(estimators=[('cnb2', cnb2), ('svc', svc),
                                         ('rf', rf)],
                             voting='hard',
                             n_jobs=-1)
    scores = model_selection.cross_val_score(model,
                                             X,
                                             y,
                                             cv=4,
                                             scoring='accuracy',
                                             n_jobs=-1)
    print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
Example #8
 def classification(self, x_train: np.ndarray,
                    y_train: np.ndarray) -> ComplementNB:
     """Return a fitted ComplementNB classification model.

     Arguments:
         x_train {np.ndarray} -- the training features
         y_train {np.ndarray} -- the training labels

     Returns:
         ComplementNB -- a fitted ComplementNB model
     """
     clf = ComplementNB()
     clf.fit(x_train, y_train)
     return clf
Example #9
def score_test(train_data, test_data, part, save_root):
    model_dict = {
        'GaussianNB': GaussianNB(),
        'MultinomialNB': MultinomialNB(),
        'BernoulliNB': BernoulliNB(),
        'ComplementNB': ComplementNB()
    }

    train_texts = list(train_data[part])
    train_labels = list(train_data['gfi_label'])
    vectorizer = CountVectorizer(max_features=10000,
                                 min_df=5,
                                 stop_words='english').fit(train_texts)

    test_ids = list(test_data['id'])
    test_texts = list(test_data[part])

    score_dict = {}
    for name in model_dict:
        score_dict[name] = get_probs(copy.deepcopy(model_dict[name]),
                                     vectorizer, train_texts, train_labels,
                                     test_texts)

    score = pd.DataFrame()
    score['id'] = test_ids
    for name in score_dict:
        probs = score_dict[name]
        score[name + '_0'] = [proba[0] for proba in probs]
        score[name + '_1'] = [proba[1] for proba in probs]
    score.to_csv(os.path.join(save_root, 'test.csv'), index=False)
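get_probs() is not shown in this example; a plausible sketch under the assumption that it fits the (copied) model on the vectorized training texts and returns per-row class probabilities:

def get_probs(model, vectorizer, train_texts, train_labels, test_texts):
    # Densify so GaussianNB also works; the other variants accept sparse input.
    x_train = vectorizer.transform(train_texts).toarray()
    x_test = vectorizer.transform(test_texts).toarray()
    model.fit(x_train, train_labels)
    return model.predict_proba(x_test)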
Example #10
def train(datafile=paths.get_dataset_path(name),
          model_file=paths.get_model_path(name)
          ):  #settings.heading_classification_model_file):
    data = pd.read_csv(datafile)
    X, Y = data_prep(data, y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 2))),  #(token_pattern=r'([a-zA-Z]|[0-9])+')),
        ('clf', ComplementNB(norm=True))
    ])

    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #weights = eli5.formatters.as_dataframe.explain_weights_df(clf, feature_names=clf['tfidf'].get_feature_names(), top=10, target_names=y_test)
    #print(weights)
    #prediction = eli5.formatters.as_dataframe.explain_prediction_df(clf, X_test[0], feature_names=clf['tfidf'].get_feature_names(), target_names=y_test)
    #print(prediction)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    report = classification_report(Y, clf.predict(X))  # note: scored on all rows, including training data
    print(report)
    with open(paths.result_path + name + '_CNB_report.txt', "w") as r:
        r.write(report)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
Example #11
def pickling():
    '''
    Creates and pickles both the vectorizer and model for use in prediction.

    Parameters
    ----------
    None

    Returns
    ----------
    None
    '''

    wrangler = Data_Handler('data/cleaned_data.csv')
    stops = wrangler.stop_words
    df = wrangler.get_top_num(15)
    X = df['description']
    y = df['variety']

    vecto = TfidfVectorizer(stop_words=stops)
    X = vecto.fit_transform(X)
    with open('pickles/text_vec.pkl', 'wb') as f:
        pickle.dump(vecto, f)

    model = ComplementNB()
    model.fit(X, y)
    with open('pickles/model.pkl', 'wb') as m:
        pickle.dump(model, m)
Example #12
def get_classifier(vocabulary):
    '''
    Abstract sentences need to be classified into one of the templates;
    this trains that classifier.
    '''

    # Prepare the dataset
    x_train = []
    y_train = []

    root = "./Qdata/question/"
    filenames = [
        filename for filename in os.listdir(root) if filename[0] == "【"
    ]
    for filename in filenames:
        label = int(filename[filename.index("【") + 1:filename.index("】")])
        with open(root + filename, "r", encoding="utf-8") as f:
            sen_list = [line.strip() for line in f.readlines()]
            x_train += sen_list
            y_train += [label] * len(sen_list)

    x_train_array = np.zeros((len(x_train), len(vocabulary)))
    for row, sentence in enumerate(x_train):
        for col, voc in enumerate(vocabulary):
            if voc in sentence:
                x_train_array[row, col] = 1

    classifier = ComplementNB()
    classifier.fit(x_train_array, y_train)

    return classifier
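A companion sketch (not in the original): at prediction time a question must be encoded with the same binary presence features before calling classifier.predict():

def classify_sentence(classifier, vocabulary, sentence):
    # Build the same one-row binary bag-of-words used during training.
    x = np.zeros((1, len(vocabulary)))
    for col, voc in enumerate(vocabulary):
        if voc in sentence:
            x[0, col] = 1
    return classifier.predict(x)[0]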
Example #13
def naive_bayes(classifier, data, labels):
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.1)
    print(f"\nTraining {classifier} NB classifier... ")
    if classifier == "gaussian":
        from sklearn.naive_bayes import GaussianNB
        nb = GaussianNB()
    elif classifier == "multinomial":
        from sklearn.naive_bayes import MultinomialNB
        nb = MultinomialNB()
    elif classifier == "complement":
        from sklearn.naive_bayes import ComplementNB
        nb = ComplementNB()
    elif classifier == "bernoulli":
        from sklearn.naive_bayes import BernoulliNB
        nb = BernoulliNB()
    else:
        return

    # 10-fold CV
    print("10-fold accuracy:", get_k_fold_accuracy(nb, 10, data, labels))

    # LOOCV
    # print("LOOCV accuracy:", get_loocv_accuracy(nb,data,labels))

    # Train model with 80% data
    nb.fit(x_train, y_train)
    print(f"\nTesting {classifier} NB classifier... ")

    # get confusion matrix
    y_pred = nb.predict(x_test)
    print_confusion_matrix(y_test, y_pred)

    return nb
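A hedged usage sketch; get_k_fold_accuracy() and print_confusion_matrix() are helpers from the surrounding project, and load_digits provides the nonnegative features the multinomial and complement variants require:

from sklearn.datasets import load_digits

data, labels = load_digits(return_X_y=True)
model = naive_bayes("complement", data, labels)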
Example #14
 def allModel(self):
     # instantiate the 4 Naive Bayes models provided by sklearn
     model1 = GaussianNB()
     model2 = BernoulliNB(binarize=1.0)  # binarize expects a float threshold
     model3 = MultinomialNB()
     model4 = ComplementNB()
     self.model = [model1, model2, model3, model4]
Example #15
def _complementnb(*,
                  train,
                  test,
                  x_predict=None,
                  metrics,
                  alpha=1.0,
                  fit_prior=True,
                  class_prior=None,
                  norm=False):
    """For for info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB
    """

    model = ComplementNB(alpha=alpha,
                         fit_prior=fit_prior,
                         class_prior=class_prior,
                         norm=norm)
    model.fit(train[0], train[1])
    model_name = 'ComplementNB'
    y_hat = model.predict(test[0])

    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    elif metrics == 'accuracy_score':
        accuracy = accuracy_score(test[1], y_hat)
    else:
        raise ValueError("unsupported metrics: " + metrics)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
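A hedged usage sketch of the keyword-only (train, test) tuple convention; accuracy_score is chosen because f1_score's default average='binary' would fail on multiclass targets:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
name, acc, _ = _complementnb(train=(x_tr, y_tr), test=(x_te, y_te),
                             metrics='accuracy_score')
print(name, acc)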
Example #16
    def buildReturnModel(self):
        model = None

        if self.flavor:
            if self.exp_type == 'classification':
                if self.flavor == 'Bernoulli':
                    model = BernoulliNB(**self.default_args)
                elif self.flavor == 'Categorical':
                    model = CategoricalNB(**self.default_args)
                elif self.flavor == 'Complement':
                    model = ComplementNB(**self.default_args)
                elif self.flavor == 'Gaussian':
                    model = GaussianNB(**self.default_args)
                elif self.flavor == 'Multinomial':
                    model = MultinomialNB(**self.default_args)
            else:
                raise ValueError(
                    'Naive Bayes can only be used for classification problems!'
                )
        else:
            raise ValueError(
                'Cannot build model because the flavor of Naive Bayes is unknown!'
            )

        return model
Example #17
def main():
    start = time.time()

    load_stop_words('stop_words.txt')
    train_df = load_train(train_file_path)
    X = train_df.iloc[:, 0].values
    Y = train_df.iloc[:, 1].values

    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', ComplementNB(alpha=0.5))])

    i = 0
    kf = KFold(n_splits=NFOLDS, shuffle=True)
    for train_index, test_index in kf.split(X):
        i += 1
        print(
            "\n************************************* Running fold: %d/%d *****************************************\n"
            % (i, NFOLDS))

        text_clf.fit(X[train_index], Y[train_index])

        accuracy = text_clf.score(X[train_index], Y[train_index])
        val_accuracy = text_clf.score(X[test_index], Y[test_index])
        y_pred = text_clf.predict(X[test_index])
        cm = confusion_matrix(Y[test_index], y_pred)
        print("accuracy: %f val_accuracy: %f\n" % (accuracy, val_accuracy))
        print(cm)

    end = time.time()
    print('Model completed in %d seconds' % (end - start))
Example #18
def findBestFitCluster(orphanCorpus, corpusCluster={}):
    """
    Given a set of questions without a cluster and a set of other clusters, find the best cluster to put the orphaned questions
    Parameters:
        orphanCorpus (tagged_question_corpus.TaggedQuestionCorpus): corpus of the questions without a cluster.
        corpusCluster ({tagged_question_corpus.TaggedQuestionCorpus}): Object containing different clusters and their corpuses

    Returns:
        xxx
    """

    # corpusCluster = {
    #     "questions": [ 'and the moon too guys', 'lets show some or a lot of love for the moon!!' ],
    #     "question_vectors": [[], []],
    #     "clusterIds": [ '4', '4' ]
    # }

    # orphanCorpus = [ {
    #         "id": 11, "question": 'Another one about the sun?', "question_vector": []
    #     },
    #     {
    #         "id": 33,
    #         "question": 'What is the distance from the sun though?', "question_vector": [] },
    #     {
    #         "id": 37,
    #         "question": 'what\'s the changing factors of the sun and moon together?', "question_vector": []
    # } ]

    # Fit the Naive Bayes model on the existing clusters
    clf = ComplementNB()
    clf.fit(corpusCluster["question_vectors"], corpusCluster["clusterIds"])

    predictions = clf.predict_proba(
        [doc["question_vector"] for doc in orphanCorpus])
Example #19
def ComplementNB_classification(train,
                                test,
                                train_labels,
                                test_labels,
                                res=None):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict collecting results across classifiers
    :return: / --> Saves data in folder "Results"
    """
    if res is None:  # avoid sharing a mutable default dict across calls
        res = {}
    print("Classifying with Complement Naive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
Example #20
def main():
    # Iris or breast cancer dataset can be used too
    x, y = datasets.load_wine(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=2405)

    # Multinomial Naive Bayes
    MNB = MultinomialNB()
    MNB.fit(x_train, y_train)
    mnb_accuracy = MNB.score(x_test, y_test)
    print(f"MultinomialNB accuracy is {round(mnb_accuracy, 4)}")

    # Gaussian Naive Bayes
    GNB = GaussianNB()
    GNB.fit(x_train, y_train)
    gnb_accuracy = GNB.score(x_test, y_test)
    print(f"GaussianNB accuracy is {round(gnb_accuracy, 4)}")

    # Complement Naive Bayes
    CNB = ComplementNB()
    CNB.fit(x_train, y_train)
    cnb_accuracy = CNB.score(x_test, y_test)
    print(f"ComplementNB accuracy is {round(cnb_accuracy, 4)}")
Example #21
def get_optimal_values_ComplementNB(x_train, y_train, x_val, y_val):
    alphas = [x / 10 for x in range(0, 11)]
    fit_priors = [True, False]
    norms = [True, False]
    max_score = 0
    optimal_fit_prior = True
    optimal_alpha = 1.0
    optimal_norm = False

    # Evaluate each combination and keep the best parameters
    for alpha in alphas:
        for fit_prior in fit_priors:
            for norm in norms:
                naive = ComplementNB(alpha=alpha,
                                     fit_prior=fit_prior,
                                     norm=norm)
                naive.fit(x_train, y_train)
                y_pred = naive.predict(x_val)
                score = accuracy_score(y_val, y_pred) * 100
                if max_score < score:
                    optimal_alpha = alpha
                    optimal_fit_prior = fit_prior
                    optimal_norm = norm
                    max_score = score
    print(max_score, optimal_alpha, optimal_fit_prior, optimal_norm)
    return max_score, optimal_alpha, optimal_fit_prior, optimal_norm
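The triple loop above is a hand-rolled grid search; an equivalent sketch over the same x_train/y_train using scikit-learn's GridSearchCV (note it cross-validates on the training data rather than scoring a fixed validation split, so results can differ slightly):

from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [x / 10 for x in range(0, 11)],
              'fit_prior': [True, False],
              'norm': [True, False]}
search = GridSearchCV(ComplementNB(), param_grid, scoring='accuracy')
search.fit(x_train, y_train)
print(search.best_score_, search.best_params_)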
Example #22
def TextClassifier1(train_feature_list, test_feature_list, train_class_list,
                    test_class_list):
    classifier = ComplementNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    # v = classifier.predict_proba(test_feature_list)
    # print(v)
    return test_accuracy
Example #23
def realizar_treinamento(registros_de_treino, vetorizador):
    treino_comentarios = [
        registro_treino[0] for registro_treino in registros_de_treino
    ]
    treino_respostas = [
        registro_treino[1] for registro_treino in registros_de_treino
    ]

    treino_comentarios = vetorizador.fit_transform(treino_comentarios)

    # modelo = BernoulliNB()
    # modelo = MultinomialNB()
    modelo = ComplementNB()
    modelo.fit(treino_comentarios, treino_respostas)

    # VALIDATION WITH CROSS-VALIDATION
    # cv = KFold(n_splits=200)
    # resultado = cross_val_predict(modelo, treino_comentarios, treino_respostas, cv=cv)
    # total = len(resultado)
    # acc = 0
    #
    # score = accuracy_score(treino_respostas, resultado)
    # print(score * 100)
    #
    # for i in range(0, total):
    #     if resultado[i] == treino_respostas[i]:
    #         acc += 1
    #
    # print(acc, total, acc/total * 100)
    #
    # print(metrics.classification_report(treino_respostas, resultado, [0, 1]))
    #
    # exit()

    return modelo
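A hedged usage sketch; vetorizador can be any vectorizer exposing fit_transform/transform, e.g. a CountVectorizer, and the training records are (comment, label) pairs:

from sklearn.feature_extraction.text import CountVectorizer

vetorizador = CountVectorizer()
registros = [("great product", 1), ("awful service", 0), ("really great", 1)]
modelo = realizar_treinamento(registros, vetorizador)
print(modelo.predict(vetorizador.transform(["great stuff"])))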
Example #24
def create_nb_classifier_pipeline(n_features):
    classifier = Pipeline([
        ('features',
         FeatureUnion([
             ('journal_title',
              Pipeline([
                  ('colext', JournalTitleSelector('journal')),
                  ('tfidf',
                   TfidfVectorizer(ngram_range=(1, 3),
                                   min_df=0.0005,
                                   max_df=0.6,
                                   strip_accents='ascii')),
              ])),
             ('article_title',
              Pipeline([('colext', TitleSelector('title')),
                        ('tfidf',
                         TfidfVectorizer(ngram_range=(1, 3),
                                         min_df=0.001,
                                         max_df=0.6,
                                         strip_accents='ascii',
                                         sublinear_tf=True))])),
             ('article_abstract',
              Pipeline([('colext', AbstractSelector('abstract')),
                        ('tfidf',
                         TfidfVectorizer(ngram_range=(1, 3),
                                         min_df=0.001,
                                         max_df=0.6,
                                         strip_accents='ascii',
                                         sublinear_tf=True))])),
         ])), ('feature_selection', SelectKBest(chi2, k=n_features)),
        ('clf', ComplementNB())
    ])
    return classifier
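JournalTitleSelector, TitleSelector, and AbstractSelector are project-specific and not shown; a minimal column-selector sketch they plausibly resemble:

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Extract one text column from a DataFrame for the downstream TF-IDF step."""

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column]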
Example #25
def BayesClassifier(dframe):
    #print(dframe.columns)
    dframe = dframe.drop(['x_7'], axis=1)
    X = dframe[list(dframe.columns)[:-1]][:-20].to_numpy()
    y = dframe[list(dframe.columns)[-1:]][:-20].to_numpy().reshape(len(dframe['y']) - 20, )
    X_validate = dframe[list(dframe.columns)[:-1]][-20:].to_numpy()
    y_validate = dframe[list(dframe.columns)[-1:]][-20:].to_numpy()
    # Note: this split is unused below; the grid search fits on all of X, y.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.08, random_state=16)
    model = ComplementNB()
    cv_gen = ShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
    model_gs = GridSearchCV(model,
                            {'norm': [True, False]},
                            scoring='accuracy',
                            n_jobs=-1,
                            cv=cv_gen)
    model_gs.fit(X, y)
    print(model_gs.best_params_)
    print("Accuracy score", model_gs.best_score_)
    #print(X_validate.shape)
    for i in range(X_validate.shape[0]):
        prediction = model_gs.best_estimator_.predict(X_validate[i].reshape(1, X_validate.shape[1]))
        print("Predicted:", prediction)
        print("Real:", y_validate[i])
        print("")
    return 0
Example #26
def _generate_title_model(data_train, labels_train, output_file):
    print('Training reference title model for recommendation...')
    title_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                          ('clf', ComplementNB())])
    title_clf.fit(data_train, labels_train)
    joblib.dump(title_clf, output_file, compress='zlib')
    print('Model file {} saved.'.format(output_file))
Example #27
 def train(self):
     """Fit the classifier selected by self.distribution."""
     print("=== Training on %d papers ===" % (len(self.train_papers)))
     if self.distribution == "Multinomial":
         self.clf = MultinomialNB()
         self.clf.fit(self.train_X, self.train_y)
     elif self.distribution == "Gaussian":
         self.clf = GaussianNB()
         self.clf.fit(self.train_X, self.train_y)
     elif self.distribution == "Complement":
         self.clf = ComplementNB()
         self.clf.fit(self.train_X, self.train_y)
     elif self.distribution == "Bernoulli":
         self.clf = BernoulliNB()
         self.clf.fit(self.train_X, self.train_y)
     elif self.distribution == "Deep":
         self.clf = self.deepCNN()
         callback = ModelCheckpoint(
             "../models/weights.{epoch:02d}-{val_acc:.5f}.hdf5",
             monitor='val_acc',
             verbose=0,
             save_best_only=True,
             save_weights_only=True,
             mode='max',
             period=1)
         self.clf.fit(x=np.expand_dims(self.deep_train_X, axis=2),
                      y=self.deep_train_y,
                      batch_size=self.batch_size,
                      epochs=self.epochs,
                      validation_split=0.1,
                      callbacks=[callback])
Example #28
    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model.
        As of right now we go with ComplementNB.
        """
        # <--- Pick one of the 3
        self._probabilistic_model = ComplementNB(alpha=self._smoothing_factor)
        #, class_prior=[0.5,0.5]
        #self._probabilistic_model = GaussianNB(var_smoothing=1)
        #self._probabilistic_model = MultinomialNB(alpha=self._smoothing_factor)
        self.X_buff = []
        self.Y_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]
        else:
            self.buckets = None

        self.data_type = col_stats['data_type']

        self.bucket_accuracy = {}
Example #29
def result_for_classifiers(data, categories_list):
    # NAIVE BAYES
    classifiers(MultinomialNB(alpha=0.05), "Naive Bayes", data,
                categories_list, False)
    # COMPLEMENT NAIVE BAYES
    classifiers(ComplementNB(alpha=0.05), "Complement Naive Bayes", data,
                categories_list, False)
    # GAUSSIAN NAIVE BAYES
    classifiers(GaussianNB(), "Gaussian Naive Bayes", data, categories_list,
                False)
    # RANDOM FOREST
    classifiers(RandomForestClassifier(), "Random Forest", data,
                categories_list, False)
    # ADABOOST
    classifiers(AdaBoostClassifier(), "AdaBoost", data, categories_list, False)
    # KNN
    classifiers(KNeighborsClassifier(), "KNN", data, categories_list, False)
    # SVM
    classifiers(SVC(), "SVM", data, categories_list, False)
    # DECISION TREES
    classifiers(DecisionTreeClassifier(), "Decision Trees", data,
                categories_list, False)
    # NEURAL NETWORK
    classifiers(MLPClassifier(hidden_layer_sizes=2, random_state=0),
                "Multilayer Perceptron", data, categories_list, True)
Example #30
 def _fit(self, X, y, reset):
     if reset or self._model_clf is None:
         self._model_clf = ComplementNB()
     train_result = self._model_clf.fit(X, y)
     joblib.dump(self._model_clf, self.__model_path)
     self._dumpmodel()
     return train_result