train_label_fn = 'train-labels-100.txt'
test_data_fn = 'test-features.txt'
test_label_fn = 'test-labels.txt'

(train_data, train_label)  = read_data(train_data_fn, train_label_fn)
(test_data, test_label)  = read_data(test_data_fn, test_label_fn)
clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%' % \
      (train_data.shape[0],accuracy_score(test_label, y_pred)*100))
    

train_data_fn = 'train-features-50.txt'
train_label_fn = 'train-labels-50.txt'
test_data_fn = 'test-features.txt'
test_label_fn = 'test-labels.txt'

(train_data, train_label)  = read_data(train_data_fn, train_label_fn)
(test_data, test_label)  = read_data(test_data_fn, test_label_fn)
clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%' % \
      (train_data.shape[0],accuracy_score(test_label, y_pred)*100))
    
clf = BernoulliNB(binarize = .5)
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%' % \
      (train_data.shape[0],accuracy_score(test_label, y_pred)*100))
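# Note: BernoulliNB(binarize=.5) simply thresholds the count features at 0.5
# before fitting. A rough manual equivalent (illustrative sketch; assumes the
# feature matrices returned by read_data are dense numpy arrays):
train_bin = (train_data > 0.5).astype(int)
test_bin = (test_data > 0.5).astype(int)
clf_manual = BernoulliNB(binarize=None)   # features are already 0/1
clf_manual.fit(train_bin, train_label)
print('Manual binarization accuracy = %.2f%%' %
      (accuracy_score(test_label, clf_manual.predict(test_bin)) * 100))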
Example #2
def main(args):

    with tf.Graph().as_default():

        with tf.Session() as sess:

            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(
                    dataset_tmp, args.min_nrof_images_per_class,
                    args.nrof_train_images_per_class)
                if (args.mode == 'TRAIN'):
                    dataset = train_set
                elif (args.mode == 'CLASSIFY'):
                    dataset = test_set
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there is at least one training image per class
            for cls in dataset:
                assert len(cls.image_paths) > 0, \
                    'There must be at least one image for each class in the dataset'

            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name(
                "input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name(
                "embeddings:0")
            phase_train_placeholder = tf.get_default_graph(
            ).get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(
                math.ceil(1.0 * nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i * args.batch_size
                end_index = min((i + 1) * args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False,
                                           args.image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(
                args.classifier_filename)

            if (args.mode == 'TRAIN'):
                # Train classifier
                print('Training classifier')
                #                 model = GaussianNB()     # returns only 0 or 1 values
                model = BernoulliNB()
                #                 model = SVC(kernel='linear', probability=True)

                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' %
                      classifier_filename_exp)

            elif (args.mode == 'CLASSIFY'):
                # Classify images
                print('Testing classifier')
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' %
                      classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[
                    np.arange(len(best_class_indices)), best_class_indices]

                for i in range(len(best_class_indices)):
                    print('%4d  %s: %.3f' %
                          (i, class_names[best_class_indices[i]],
                           best_class_probabilities[i]))

                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)
Example #3
)
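
# The benchmark() helper called below is defined in the part of the script not
# shown here; a minimal sketch of what such a helper typically looks like
# (hypothetical, assumes X_train, y_train, X_test, y_test already exist):
from time import time
from sklearn import metrics

def benchmark(clf, name=None):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, pred)
    print('%s: accuracy=%.3f, train time=%.2fs' %
          (name or clf.__class__.__name__, acc, train_time))
    return acc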

# Train NearestCentroid without threshold
benchmark(
    NearestCentroid(),
    name="NearestCentroid (aka Rocchio classifier)"
)

# Train sparse Naive Bayes classifiers
benchmark(
    MultinomialNB(alpha=.01),
    name="Naive Bayes MultinomialNB"
)

benchmark(
    BernoulliNB(alpha=.01),
    name="Naive Bayes BernoulliNB"
)

benchmark(
    ComplementNB(alpha=.1),
    name="Naive Bayes ComplementNB"
)

# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
benchmark(
    Pipeline([
        ('feature_selection',
            SelectFromModel(
                LinearSVC(
Example #4
predicted = grid_search.predict(docs_new)
print(predicted)
for doc, category in zip(['hit1', 'hit2', 'miss1', 'miss2'], predicted):
    print('%r => %s' % (doc, dataset.target_names[category]))

# In[105]:

from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ('vect',
     TfidfVectorizer(ngram_range=(1, 2),
                     min_df=3,
                     max_df=0.95,
                     stop_words='english')),
    ('clf', BernoulliNB()),
])
pipeline.fit(docs_train, y_train)

# In[106]:

y_predicted = pipeline.predict(docs_test)
print(
    metrics.classification_report(y_test,
                                  y_predicted,
                                  target_names=dataset.target_names))

# In[107]:

cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)
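# A possible follow-up cell (illustrative, not part of the original notebook):
# tune BernoulliNB's smoothing parameter on the same pipeline.
from sklearn.model_selection import GridSearchCV

param_grid = {'clf__alpha': [0.01, 0.1, 0.5, 1.0]}
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid.fit(docs_train, y_train)
print(grid.best_params_, grid.best_score_)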
def __init__(self, **kwargs):
    super().__init__()
    self.classifier = BernoulliNB()
    self.kwargs = kwargs
Example #6
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier  # needed for clf2 below
from sklearn import svm
import numpy as np

data = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 0, 0], [1, 0, 1], [1, 1, 1],
        [1, 1, 1], [1, 1, 0], [0, 1, 1], [0, 0, 1]]
X = np.array(data)
Y = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0])

test = np.array([[0, 1, 0]])

clf = BernoulliNB(alpha=0.000001)
#clf.fit(X, Y)
#print("Naive Bayes:", clf.predict_proba(test))

clf2 = DecisionTreeClassifier()
#clf2.fit(X, Y)
#print("DT:", clf2.predict_proba(test))

clf3 = KNeighborsClassifier()
#clf3.fit(X, Y)
#print("KNN", clf3.predict_proba(test))

clf4 = MLPClassifier(max_iter=1000)
#clf4.fit(X, Y)
#print("MLP", clf4.predict_proba(test))
Example #7
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=66)

# Average CV score on the training set was: 0.6476751946607341
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=14, subset_list="subsets.csv"),
    RBFSampler(gamma=0.55),
    BernoulliNB(alpha=0.01, fit_prior=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
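# A quick check of the exported pipeline (not part of the TPOT export itself):
from sklearn.metrics import accuracy_score
print("Holdout accuracy:", accuracy_score(testing_target, results))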
Example #8
##datasets with a validation set
X_train2 = full_df[:1120000, :]
X_valid = full_df[1120000:1600000, :]

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train1 = le.fit_transform(full_data['Sentiment'])
y_train2 = le.transform(full_data['Sentiment'][:1120000])
y_valid = le.transform(full_data['Sentiment'][1120000:])
y_test = le.transform(test_data_pos_neg['Sentiment'])

###### Try Bernoulli Naive Bayes model without word stemming ######
from sklearn.naive_bayes import BernoulliNB
##Convert word frequency matrix into binary matrix
X_train1_bin = X_train1.copy()
X_train1_bin[X_train1_bin > 0] = 1

clf_ber_bayes = BernoulliNB()
clf_ber_bayes.fit(X_train1_bin, y_train1)

train_preds = clf_ber_bayes.predict(X_train1_bin)
accuracy_score(train_preds, y_train1)
#Convert test dataframe to binary
X_test_bin = X_test.copy()
X_test_bin[X_test_bin > 0] = 1

test_preds = clf_ber_bayes.predict(X_test_bin)
accuracy_score(y_test, test_preds)  ##84.12 % accuracy_score
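# Equivalent shortcut (illustrative): BernoulliNB can do the 0/1 conversion
# itself via its binarize parameter, so the manual thresholding above can also
# be written as
clf_alt = BernoulliNB(binarize=0.0)   # any count > 0 is treated as 1
clf_alt.fit(X_train1, y_train1)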
print(Matr.shape)
Matr = Matr[1:]
print(len(Yval))

a=1000
b=100000
prior1=(a+spamc-1)*1.0/(a+b+spamc+legitc-2)
prior2=(a+legitc-1)*1.0/(a+b+spamc+legitc-2)
#   y=beta.pdf(x, a, b)
from sklearn.metrics import precision_recall_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
a_train, a_test, b_train, b_test = train_test_split(Matr, Yval, test_size=0.2, random_state=42)
clf = MultinomialNB(class_prior=[1,2])
clf2= BernoulliNB(class_prior=[prior1,prior2])
clf.fit(a_train, b_train)
clf2.fit(a_train, b_train)
Ax=clf.predict(a_test)
Bx=clf2.predict(a_test)
from sklearn.metrics import f1_score

#print(f1_score(b_test, Ax, average='macro'))
print(f1_score(b_test, Bx, average='macro'))

import matplotlib.pyplot as plt

precision, recall, _ = precision_recall_curve(b_test, Bx)

plt.step(recall, precision, color='b', alpha=0.2,where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
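# For a more informative curve one would normally pass probability scores
# rather than hard predictions (sketch; assumes the labels in Yval are 0/1 so
# that column 1 of predict_proba is the positive class):
probs = clf2.predict_proba(a_test)[:, 1]
precision, recall, _ = precision_recall_curve(b_test, probs)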
Example #10
def main():
    show_plots = False #set to True to show plots, False to not show plots

    #read categories from arguments. e.g. "python3 test.py Comedy Drama Documentary Horror"
    categories = []
    for arg in sys.argv[1:]:
        categories.append(arg)

    X, y, files_used = read_files(categories)

    try:
        high_info_words = high_information_words(X, y)

        X_high_info = []
        for bag in X:
            new_bag = []
            for words in bag:
                if words in high_info_words:
                    new_bag.append(words)
            X_high_info.append(new_bag)
    except ZeroDivisionError:
        print("Not enough information too get high information words, please try again with more files.", file=sys.stderr)
        X_high_info = X

    X_wpm = wpm(files_used, categories, show_plots)
    X_dpm = dpm(files_used, categories, show_plots)
    X_wd = word_distribution(files_used, categories)

    doc2vec_model = Doc2Vec.load("d2v_150.model")
    #doc2vec_model = Doc2Vec.load("d2v_400.model")

    #Reason I don't infer the vectors: the data was already used when training the vector model (with tagged documents), so the stored vectors can simply be retrieved
    X_d2v = [doc2vec_model.docvecs[str(i)] for i in range(len(X))]
    #X_d2v = [doc2vec_model.infer_vector(to_list(str(i))) for i in X] 

    X = [(str(x), str(x_high), wpm, dpm, wd, d2v) for x, x_high, wpm, dpm, wd, d2v in zip(X, X_high_info, X_wpm, X_dpm, X_wd, X_d2v)]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

    clfs = [
        SVC(C=10, cache_size=500, class_weight=None, coef0=0.0, #parameters found using grid_search.py
        decision_function_shape=None, degree=3, gamma=0.0001, kernel='linear',
        max_iter=100000, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False),
        MultinomialNB(alpha=1.0),
        BernoulliNB(),
    ]

    pipeline = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the features from subject and body
        ('union', FeatureUnion(
            transformer_list=[
                #Pipeline bag-of-words model 
                ('text', Pipeline([
                    ('selector', ItemSelector(key='text')),
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3))),
                    #('chi-square', SelectKBest(chi2, 300)),
                ])),

                #Pipeline for high info words bag-of-words model 
                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_high')),
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, norm='l2')),
                ])),

                #Pipeline for wpm feature
                ('wpm', Pipeline([
                    ('selector', ItemSelector(key='wpm')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for dpm feature
                ('dpm', Pipeline([
                    ('selector', ItemSelector(key='dpm')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for wd feature
                ('wd', Pipeline([
                    ('selector', ItemSelector(key='wd')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for d2v feature
                ('d2v', Pipeline([
                    ('selector', ItemSelector(key='d2v')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for POS tag features
                # ('pos', Pipeline([
                #     ('selector', ItemSelector(key='pos')),
                #     ('words', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3)))
                # ])),

            ],

            # weight components in FeatureUnion
            transformer_weights={ 
                'text': 0.2,
                'text_high' : 1,
                'wpm': 0,
                'dpm': 0.2,
                'wd': 0,
                'd2v': 0,
                #'pos': 0,
            },
        )),

        # Use a classifier on the combined features
        ('classifier', clfs[0]),
    ])

    train(pipeline, X_train, y_train, categories, show_plots)

    final_pred = pipeline.predict(X_test)
    print("\nScores on test set:\n")
    print(metrics.accuracy_score(y_test, final_pred))
    print(metrics.classification_report(y_test, final_pred, digits=3))

    confusion_m = metrics.confusion_matrix(y_test, final_pred, labels=categories)
    plt.figure(figsize = (16, 9), dpi=150)
    sn.set(font_scale=1.4) #label size
    hm = sn.heatmap(confusion_m, annot=True, fmt='g', annot_kws={"size": 16}) #font size
    hm.set(xticklabels = categories, yticklabels = categories)
    plt.title(str(pipeline.named_steps['classifier']).split("(")[0] + ' Confusion Matrix')
    if show_plots:
        plt.show()
    hm.figure.savefig(str(pipeline.named_steps['classifier']).split("(")[0] + '_confusion_matrix_test' + '.png', dpi=150)
    plt.close()
Example #11
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(RandomForestClassifier(random_state=random_state))

#Gaussian process
classifiers.append(GaussianProcessClassifier(random_state=random_state))

#Generalized linear models
classifiers.append(LogisticRegressionCV(random_state=random_state))
classifiers.append(PassiveAggressiveClassifier(random_state=random_state))
classifiers.append(RidgeClassifierCV())
classifiers.append(SGDClassifier(random_state=random_state))
classifiers.append(Perceptron(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))

#Naive Bayes
classifiers.append(BernoulliNB())
classifiers.append(GaussianNB())

#Nearest Neighbors
classifiers.append(KNeighborsClassifier())

#Discriminant analysis
classifiers.append(LinearDiscriminantAnalysis())

#Support vector machine
classifiers.append(SVC(random_state=random_state, probability=True))
classifiers.append(NuSVC(random_state=random_state, probability=True))
classifiers.append(LinearSVC(random_state=random_state))

#Trees
classifiers.append(DecisionTreeClassifier(random_state=random_state))
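# A typical way such a list is consumed (the original loop is not shown in this
# snippet); illustrative sketch assuming a feature matrix X and labels y:
from sklearn.model_selection import cross_val_score
for clf in classifiers:
    scores = cross_val_score(clf, X, y, cv=5, n_jobs=-1)
    print("%s: %.3f" % (clf.__class__.__name__, scores.mean()))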
Example #12
# [5] Result as a percentage

from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results = []
    for clf, name in [
        (BernoulliNB(alpha=0.4), 'Naive Bayes'),
        (LinearSVC(C=9), 'SVC'),
        (
            DecisionTreeClassifier(max_depth=26),
            'DecisionTreeClassifier',
        ),
            # (LogisticRegression(C=12), 'LogisticRegression'),
            # (RandomForestClassifier(max_depth=2, random_state=0), 'RandomForest'),
        (KNeighborsClassifier(n_neighbors=13), 'KNN')
    ]:
        #     Y_train.reshape(Y_train.shape[0],)
        #     Y_test.reshape(Y_test.shape[0])
        clf.fit(X_train, Y_train)

        predictions = clf.predict(X_train)
        training_accuracy = accuracy_score(predictions, Y_train)
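        # The snippet is cut off here; a plausible continuation (illustrative,
        # assumes the X_test/Y_test split hinted at in the comments above):
        test_accuracy = accuracy_score(clf.predict(X_test), Y_test)
        results.append((name, training_accuracy, test_accuracy))
        print(name, training_accuracy, test_accuracy)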
Example #13
from scorer_semeval18 import main as eval

tokenized_tweets = pickle.load(open(TOK_TWEETS_PATH, 'rb'))
print('loaded tweets')

data_matrix = construct_data_matrix(tokenized_tweets)
print('constructed data matrix')
print('Dim:', data_matrix.shape)
print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix))

labels = np.asarray(open(CLEAN_LABELS_PATH).read().splitlines())
data_train, data_test, labels_train, labels_test = split_data(
    data_matrix, labels)
print('split data')

bern = BernoulliNB()
bern.fit(data_train, labels_train)
print("\nbern", bern.score(data_test, labels_test))
eval(labels_test, bern.predict(data_test))

multi = MultinomialNB()
multi.fit(data_train + abs(np.min(data_train)), labels_train)
print("\nmulti", multi.score(data_test + abs(np.min(data_test)), labels_test))
eval(labels_test, multi.predict(data_test))

tree = DecisionTreeClassifier(max_depth=10)
tree.fit(data_train, labels_train)
print("\ntree", tree.score(data_test, labels_test))
eval(labels_test, tree.predict(data_test))

clf = RandomForestClassifier(max_depth=3)
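# The snippet ends here; presumably the random forest is evaluated like the
# models above (illustrative continuation):
clf.fit(data_train, labels_train)
print("\nforest", clf.score(data_test, labels_test))
eval(labels_test, clf.predict(data_test))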
Example #14
File: a1q3p3.py  Project: mbenitah/comp550
def classify(X, y, clf_type='nbc'):
    """
    Preprocess the input documents to extract feature vector representations of
    them. Your features should be N-gram counts, for N<=2.

    1. Experiment with the complexity of the N-gram features (i.e., unigrams,
       or unigrams and bigrams): `gram_min` + `gram_max`
    2. Experiment with removing stop words. (see NLTK)
    3. Remove infrequently occurring words and bigrams as features. You may tune
       the threshold at which to remove infrequent words and bigrams.
    4. Search over hyperparameters for the three models (nb, svm, lr) to
       find the best performing model.

    All 4 of the above are done in the context of 10-fold cross validation on
    the data. On the training data, 3-fold cross validation is done to find the
    optimal hyperparameters (using randomized CV), which are then tested on
    held-out data.
    """

    if clf_type == 'nbc':
        clf = BernoulliNB()
        params = SETTINGS_NB
    elif clf_type == 'svc':
        clf = LinearSVC()
        params = SETTINGS_SVC
    elif clf_type == 'lrc':
        clf = LogisticRegression()
        params = SETTINGS_LR
    else:
        raise Exception("invalid clf {}: expected one of 'nbc', 'svc', 'lrc'".format(clf_type))

    # pipeline runs preprocessing and model during every CV loop
    pipe = Pipeline([
        ('pre', CountVectorizer()),
        ('clf', clf),
    ])

    model = RandomizedSearchCV(
        pipe, params, n_jobs=-1, n_iter=N_CV, cv=INNER, scoring='f1_macro'
    )

    results = {
        'test':  {'loss': [], 'accuracy': [], 'confusion': [], 'errors': []},
        'train': {'loss': [], 'accuracy': [], 'confusion': []},
        'cv': {}
    }

    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True)

    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print("[{}] {}/{}".format(clf_type, i+1, FOLDS))

        # split training and test sets
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]

        # fit model
        model.fit(X_train, y_train)

        # save the best parameters from the inner-fold cross validation
        best_params = model.best_estimator_.get_params()
        for p in sorted(params.keys()):
            results['cv'][p] = best_params[p]

        # make predictions on train and test set
        y_test_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        # record some misclassified sentences
        idx_errors = np.where(y_test_pred != y_test)[0]
        np.random.shuffle(idx_errors)
        errors = X_test[idx_errors[:5]]
        results['test']['errors'].extend(errors)

        # store results
        results['test']['loss'].append(log_loss(y_test, y_test_pred))
        results['test']['accuracy'].append(accuracy_score(y_test, y_test_pred))
        results['test']['confusion'].append(confusion_matrix(y_test, y_test_pred))
        results['train']['loss'].append(log_loss(y_train, y_train_pred))
        results['train']['accuracy'].append(accuracy_score(y_train, y_train_pred))
        results['train']['confusion'].append(confusion_matrix(y_train, y_train_pred))

    return results
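# Example usage (illustrative; assumes X and y are the documents and labels
# loaded elsewhere in a1q3p3.py, as numpy arrays):
nbc_results = classify(X, y, clf_type='nbc')
print(np.mean(nbc_results['test']['accuracy']))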
Example #15
# vectorizer = TfidfVectorizer(use_idf=True, norm="l2")

X = vectorizer.fit_transform(all_reviews).toarray()
pos = len(pos_reviews)
neg = len(neg_reviews)
y = [1] * pos + [0] * neg
'''Splitting data into Training and Testing Sets'''
# divide data into 20% test set and 80% training set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
'''Training and testing the model'''
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

classifier = BernoulliNB(alpha=0.001)
classifier.fit(X_train, y_train)

# predict the sentiment for the documents in our test
y_pred = classifier.predict(X_test)
'''Check the accuracy of the model'''
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# '''Saving and Loading the Model'''
# # save trained model for later use.
# with open('RF_classifier', 'wb') as picklefile:
#     pickle.dump(classifier, picklefile)
# # to load the model use
# with open('RF_classifier', 'rb') as training_model:
Example #16
            votes.append(v)
        return str(mode(votes)[0])

    def confidence(self, features):
        votes =[]
        for c in self._classifiers:
            v = c.predict(features)
            votes.append(v)
        choice_votes = int(mode(votes)[1])
        conf = choice_votes / len(votes)
        return conf
    #def test_accuracy(self, x2,x3,x4,x5,x6, x7):
    #    average = mean([x2,x3,x4,x5,x6, x7])
    #    return average

BNB = BernoulliNB()
BNB.fit(tfidf_train, y_train)
pred = BNB.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
x2 = metrics.accuracy_score(y_test, pred)
print("BernoulliNB Naive Bayes Accuracy:   %0.3f" % score)
#cm = metrics.confusion_matrix(y_test, pred, labels=[0,1])
#plot_confusion_matrix(cm, classes=[0, 1])

save_classifier = open("Pickled/BernoulliNB.pickle", "wb")
pickle.dump(BNB, save_classifier)
save_classifier.close()

LR = LogisticRegression()
LR.fit(tfidf_train, y_train)
pred = LR.predict(tfidf_test)
Example #17
def __init__(self):
    self.name = "nb"
    self.model = BernoulliNB()
Example #18
def classification_naive_bayes(X, Y, nome):
    nb_model = BernoulliNB()
    classification_model_cv(X, Y, nb_model, "Naive Bayes "+nome)
Example #19

import numpy as np
X = np.random.randint(100, size=(10000, 100))
Y = np.random.randint(5, size=(10000, 1))

X.shape
Y.shape

from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X, Y.ravel())  # ravel() avoids the column-vector warning

Z = np.random.randint(10, size=(1, 100))
print(clf.predict(Z))
Example #20
def classification_voting(X,y, nome):
    clf2 = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2, random_state=0)
    clf3 = BernoulliNB()
    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('bnb', clf3)],voting = 'soft')
    classification_model_cv(X, y, eclf2, "Voting Model "+nome)
Example #21
    df = pd.concat([df, dummy], axis=1)
    df = df.drop(columns=feat, axis=1)
    dummy = None

# split data to train, heldout, and test datasets
print('INFO: Spliting data into train/heldout/test datasets.')
x_train = np.array(df[df['data'] == 'T'].drop(columns=['data', 'result']))
y_train = np.array(df[df['data'] == 'T']['result'].astype('bool'))
x_valid = np.array(df[df['data'] == 'V'].drop(columns=['data', 'result']))
y_valid = np.array(df[df['data'] == 'V']['result'].astype('bool'))
x_hold = np.array(df[df['data'] == 'H'].drop(columns=['data', 'result']))
y_hold = np.array(df[df['data'] == 'H']['result'].astype('bool'))

# machine learning classification models
classif = [('Gaussian Naive Bayes', GaussianNB()),
           ('Bernoulli Naive Bayes', BernoulliNB()),
           ('Complement Naive Bayes', ComplementNB()),
           ('Multinomial Naive Bayes', MultinomialNB()),
           ('LOGistic Regression',
            LogisticRegression(solver='liblinear',
                               multi_class='ovr',
                               penalty='l2',
                               random_state=24)),
           ('LOGistic Regression 2',
            LogisticRegression(solver='saga',
                               multi_class='ovr',
                               l1_ratio=0.3,
                               penalty='elasticnet',
                               max_iter=1000,
                               random_state=24)),
           ('LOGistic Regression 3',
Example #22
        print("t = %d" % X_valid.shape[0])
        print("Num classes = %d" % len(np.unique(y)))

        model = RandomForestClassifier()
        model.fit(X, y)
        y_pred = model.predict(X_valid)
        v_error = np.mean(y_pred != y_valid)
        print("Random Forest (sklearn) validation error: %.3f" % v_error)

        model = NaiveBayes(num_classes=4, beta=1)
        model.fit(X, y)
        y_pred = model.predict(X_valid)
        v_error = np.mean(y_pred != y_valid)
        print("Naive Bayes (ours) validation error: %.3f" % v_error)

        model = BernoulliNB()
        model.fit(X, y)
        y_pred = model.predict(X_valid)
        v_error = np.mean(y_pred != y_valid)
        print("Naive Bayes (sklearn) validation error: %.3f" % v_error)

    elif question == '2':
        dataset = load_dataset('vowel.pkl')
        X = dataset['X']
        y = dataset['y']
        X_test = dataset['Xtest']
        y_test = dataset['ytest']
        print("\nn = %d, d = %d\n" % X.shape)

        def evaluate_model(model):
            model.fit(X,y)
Example #23
print("Total sarcastic lines = " + str(sarcasm_size))
neutral_size = len(neutral)
print("Total non-sarcastic lines = " + str(neutral_size))

for i in range(0, sarcasm_size):
    labels.append(1)
for i in range(0, neutral_size):
    labels.append(0)
print(len(labels))

dataset = np.concatenate([sarcasm, neutral])
print("Total length of dataset = " + str(len(dataset)))

#Classify using Naive Bayes:
from sklearn.naive_bayes import BernoulliNB
vec, clf = TfidfVectorizer(min_df=5), BernoulliNB()
td_matrix = vec.fit_transform(dataset)
print("Shape of matrix = " + str(td_matrix.shape))
print("Length of the labels = " + str(len(labels)))
X_train, X_test, y_train, y_test = train_test_split(td_matrix,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=0)

clf.fit(X_train, y_train)
y_out = clf.predict(X_test)

print("Accuracy on held-out data: ",
      str(100 * accuracy_score(y_out, y_test))[0:5], "%\n")

#Accuracy on held-out data: MultinomialNB 83.79 %, BernoulliNB 84.49%, DecisionTree=84.40%, RandomForest=82.39%
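#The comparison in the comment above can be reproduced by swapping classifiers
#in the same setup (illustrative sketch):
from sklearn.naive_bayes import MultinomialNB
for name, model in [('MultinomialNB', MultinomialNB()), ('BernoulliNB', BernoulliNB())]:
    model.fit(X_train, y_train)
    print(name, "accuracy:", str(100 * accuracy_score(model.predict(X_test), y_test))[0:5], "%")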
Example #24
classifier.show_most_informative_features(15)

save_classifier = open("pickled_algos/originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

MNB_classifer = SklearnClassifier(MultinomialNB())
MNB_classifer.train(training_set)
print("MNB Acc: " +
      str((nltk.classify.accuracy(MNB_classifer, testing_set)) * 100))

save_classifier = open("pickled_algos/MNB_classifier5k.pickle", "wb")
pickle.dump(MNB_classifer, save_classifier)
save_classifier.close()

BernoulliNB_classifer = SklearnClassifier(BernoulliNB())
BernoulliNB_classifer.train(training_set)
print("BernoulliNB Acc: " +
      str((nltk.classify.accuracy(BernoulliNB_classifer, testing_set)) * 100))

save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle", "wb")
pickle.dump(BernoulliNB_classifer, save_classifier)
save_classifier.close()

# SVC_classifer = SklearnClassifier(SVC())
# SVC_classifer.train(training_set)
# print("SVC: "+ str((nltk.classify.accuracy(SVC_classifer, testing_set))*100))

LinearSVC_classifer = SklearnClassifier(LinearSVC())
LinearSVC_classifer.train(training_set)
print("LinearSVC: " +
testing_set = featuresets[800:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("NaiveBayes Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

classifier.show_most_informative_features(15)

from sklearn import metrics

MNB_clf = SklearnClassifier(MultinomialNB())
mnb_cls = MNB_clf.train(training_set)

print("MultinomialNB Classifier accuracy percent:", (nltk.classify.accuracy(mnb_cls, testing_set))*100)

BNB_clf = SklearnClassifier(BernoulliNB())
bnb_cls = BNB_clf.train(training_set)

print("BernoulliNB Classifier accuracy percent:", (nltk.classify.accuracy(bnb_cls, testing_set))*100)


LogReg_clf = SklearnClassifier(LogisticRegression())
logReg_cls = LogReg_clf.train(training_set)

print("LogisticRegression Classifier accuracy percent:", (nltk.classify.accuracy(logReg_cls, testing_set))*100)

SGD_clf = SklearnClassifier(SGDClassifier())
sgd_cls = SGD_clf.train(training_set)

print("SGD Classifier accuracy percent:", (nltk.classify.accuracy(sgd_cls, testing_set))*100)

Example #26
File: bayes.py  Project: 2721485344/Email
# df1 = transformer.fit_transform(jieba_cut_content)

print('=' * 30 + ' starting SVD dimensionality reduction ' + '=' * 30)
svd = TruncatedSVD(n_components=20)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)

print('=' * 30 + ' rebuilding the feature matrix ' + '=' * 30)
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])

# Data cleaning above is done --------> TF-IDF text transform and SVD dimensionality reduction

print('=' * 30 + ' building the Bernoulli naive Bayes model ' + '=' * 30)
nb = BernoulliNB(alpha=1.0, binarize=0.0005)  # binarization threshold
model = nb.fit(data, y_train)

print('=' * 30 + ' building the test set ' + '=' * 30)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(
    svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])

print('=' * 30 + ' predicting on the test set ' + '=' * 30)
start = time.time()
y_predict = model.predict(data_test)
end = time.time()
print('Naive Bayes prediction took %.2f seconds' % (end - start))
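# Illustrative follow-up (not in the original snippet): report held-out
# accuracy, assuming y_test holds the true labels for x_test.
from sklearn.metrics import accuracy_score
print('Bernoulli NB test accuracy: %.4f' % accuracy_score(y_test, y_predict))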
#Decision Tree Classifier
clf_DT = DecisionTreeClassifier(criterion='gini',
                                splitter='best',
                                max_depth=10,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features=None,
                                max_leaf_nodes=None,
                                min_impurity_decrease=1e-07)
clf_DT.fit(X_train, y_train)
y_pred_DT = clf_DT.predict(X_val)

#Naive Bayes Classifier
clf_NB = BernoulliNB()
clf_NB.fit(X_train, y_train)
y_pred_NB = clf_NB.predict(X_val)

#NN Classifier
MLPClassifier(activation='relu',
              alpha=1e-05,
              batch_size='auto',
              beta_1=0.9,
              beta_2=0.999,
              early_stopping=False,
              epsilon=1e-08,
              hidden_layer_sizes=(64),
              learning_rate='constant',
              learning_rate_init=0.001,
              max_iter=2000,
# Train SGD with Elastic Net penalty
print(80 * '=')
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")))

# Train NearestCentroid without threshold
print(80 * '=')
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print(80 * '=')
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))


class L1LinearSVC(LinearSVC):
    def fit(self, X, y):
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.
        self.transformer_ = LinearSVC(penalty="l1", dual=False, tol=1e-3)
        X = self.transformer_.fit_transform(X, y)
        return LinearSVC.fit(self, X, y)

    def predict(self, X):
        X = self.transformer_.transform(X)
        return LinearSVC.predict(self, X)

Example #29
    Y = data[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

    return (X_train, X_test, y_train, y_test)


X, X_test, y, y_test = split_data('a_processed.txt')
# Logistic regression binary classification model
model = LogisticRegression(solver='liblinear')
model.fit(X, y)
model.score(X, y)
weights = model.coef_
intercept = model.intercept_
predicted = model.predict(X_test)
# Bernoulli naive Bayes model
clf = BernoulliNB()
clf.fit(X, y)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
# Multinomial naive Bayes model
mul = MultinomialNB().fit(X, y)


#print (clf.predict(X_test))
# prediction evaluation
def prediction(pre, ac, right, wrong):
    for i in range(len(pre)):
        if pre[i] == ac[i]:
            right = right + 1
        else:
            wrong = wrong + 1
    rate = right / (right + wrong)
Example #30
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  clf.fit(X_train, y_train)
  score = f05_scorer(clf, X_test, y_test)
  if score > best_score:
    best_clf = clf
    best_score = score

fout = open('kbest-multinomialNB.pickle', 'wb')
pickle.dump(clf,fout)
fout.close()

#######################
print "Bernoulli NB"
clf = BernoulliNB(binarize = 0.0, alpha = 0.25, fit_prior = False)

kf = KFold(72000, n_folds=10, shuffle=True)
best_score = 0
best_clf = 0
for train_index, test_index in kf:
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  clf.fit(X_train, y_train)
  score = f05_scorer(clf, X_test, y_test)
  if score > best_score:
    best_clf = clf
    best_score = score

fout = open('kbest-bernoulliNB.pickle', 'wb')