# --- MultinomialNB on the '-100' training files -----------------------------
# NOTE(review): train_data_fn is not assigned in this part of the script for
# the first run; presumably it is set just above -- confirm.
train_label_fn = 'train-labels-100.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%'
      % (train_data.shape[0], 100 * accuracy_score(test_label, y_pred)))

# --- MultinomialNB on the '-50' training files ------------------------------
train_data_fn, train_label_fn = 'train-features-50.txt', 'train-labels-50.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%'
      % (train_data.shape[0], 100 * accuracy_score(test_label, y_pred)))

# --- BernoulliNB on the same '-50' data, features thresholded at 0.5 --------
clf = BernoulliNB(binarize=.5)
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%'
      % (train_data.shape[0], 100 * accuracy_score(test_label, y_pred)))
def main(args):
    """Train or evaluate a BernoulliNB classifier on image embeddings.

    The function loads a dataset, runs every image through the feature
    extraction model named by ``args.model`` to obtain embeddings, and then
    either fits a classifier and pickles it (``args.mode == 'TRAIN'``) or
    unpickles a classifier and reports its predictions and accuracy
    (``args.mode == 'CLASSIFY'``).

    Args:
        args: Namespace-like object. Attributes read here: ``seed``,
            ``use_split_dataset``, ``data_dir``,
            ``min_nrof_images_per_class``, ``nrof_train_images_per_class``,
            ``mode``, ``model``, ``batch_size``, ``image_size`` and
            ``classifier_filename``.

    Raises:
        ValueError: If any class in the dataset has no images.
    """
    with tf.Graph().as_default():
        with tf.Session() as sess:
            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(
                    dataset_tmp, args.min_nrof_images_per_class,
                    args.nrof_train_images_per_class)
                # NOTE(review): any other mode leaves `dataset` unbound here.
                if args.mode == 'TRAIN':
                    dataset = train_set
                elif args.mode == 'CLASSIFY':
                    dataset = test_set
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there is at least one image per class.  The previous
            # `assert (cond, msg)` asserted a non-empty tuple, which is always
            # true, so the check never fired.
            for cls in dataset:
                if len(cls.image_paths) == 0:
                    raise ValueError(
                        'There must be at least one image for each class in the dataset')

            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            graph = tf.get_default_graph()
            images_placeholder = graph.get_tensor_by_name("input:0")
            embeddings = graph.get_tensor_by_name("embeddings:0")
            phase_train_placeholder = graph.get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings, one batch at a time
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(
                math.ceil(1.0 * nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i * args.batch_size
                # The last batch may be shorter than batch_size.
                end_index = min((i + 1) * args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False,
                                           args.image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(
                args.classifier_filename)

            if args.mode == 'TRAIN':
                # Train classifier
                print('Training classifier')
                # Alternatives previously tried here:
                #   GaussianNB()  (noted as returning only 0 or 1 values)
                #   SVC(kernel='linear', probability=True)
                model = BernoulliNB()
                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' %
                      classifier_filename_exp)

            elif args.mode == 'CLASSIFY':
                # Classify images
                print('Testing classifier')
                # SECURITY: pickle.load executes arbitrary code from the file;
                # only load classifier files from a trusted source.
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' %
                      classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[
                    np.arange(len(best_class_indices)), best_class_indices]

                for i, class_index in enumerate(best_class_indices):
                    print('%4d %s: %.3f' %
                          (i, class_names[class_index],
                           best_class_probabilities[i]))

                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)
) # Train NearestCentroid without threshold benchmark( NearestCentroid(), name="NearestCentroid (aka Rocchio classifier)" ) # Train sparse Naive Bayes classifiers benchmark( MultinomialNB(alpha=.01), name="Naive Bayes MultinomialNB" ) benchmark( BernoulliNB(alpha=.01), name="Naive Bayes BernoulliNB" ) benchmark( ComplementNB(alpha=.1), name="Naive Bayes ComplementNB" ) # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. benchmark( Pipeline([ ('feature_selection', SelectFromModel( LinearSVC(
# Show what the grid-searched model predicts for the new documents.
predicted = grid_search.predict(docs_new)
print(predicted)

for doc, category in zip(['hit1', 'hit2', 'miss1', 'miss2'], predicted):
    print('%r => %s' % (doc, dataset.target_names[category]))

# In[105]:

from sklearn.naive_bayes import BernoulliNB

# TF-IDF features (unigrams and bigrams) feeding a Bernoulli naive Bayes model.
pipeline = Pipeline([
    (
        'vect',
        TfidfVectorizer(
            ngram_range=(1, 2),
            min_df=3,
            max_df=0.95,
            stop_words='english',
        ),
    ),
    ('clf', BernoulliNB()),
])
pipeline.fit(docs_train, y_train)

# In[106]:

y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names,
))

# In[107]:

cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)
def __init__(self, **kwargs):
    """Initialise the parent class, then store the options and a fresh BernoulliNB."""
    super().__init__()
    # Keyword options are kept as given; they are not applied to the model here.
    self.kwargs = kwargs
    self.classifier = BernoulliNB()
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import svm
import numpy as np

# Ten samples with three binary features each.
data = [
    [0, 0, 0],
    [0, 0, 0],
    [0, 0, 0],
    [1, 0, 0],
    [1, 0, 1],
    [1, 1, 1],
    [1, 1, 1],
    [1, 1, 0],
    [0, 1, 1],
    [0, 0, 1],
]
X = np.array(data)
Y = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0])

# Single query point used by the (currently disabled) predict_proba calls.
test = np.array([[0, 1, 0]])

# The classifiers are only constructed here; fitting is commented out.
clf = BernoulliNB(alpha=1e-06)
#clf.fit(X, Y)
#print("Naive Bayes:", clf.predict_proba(test))

# NOTE(review): DecisionTreeClassifier is not imported in the lines visible
# here; presumably it is imported earlier in the file -- confirm.
clf2 = DecisionTreeClassifier()
#clf2.fit(X, Y)
#print("DT:", clf2.predict_proba(test))

clf3 = KNeighborsClassifier()
#clf3.fit(X, Y)
#print("KNN", clf3.predict_proba(test))

clf4 = MLPClassifier(max_iter=1000)
#clf4.fit(X, Y)
#print("MLP", clf4.predict_proba(test))
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)

# Every column except 'target' is a feature.
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=66)

# Average CV score on the training set was:0.6476751946607341
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=14, subset_list="subsets.csv"),
    RBFSampler(gamma=0.55),
    BernoulliNB(alpha=0.01, fit_prior=True),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB

## Datasets with a validation set: rows before 1,120,000 train, the rest up
## to 1,600,000 validate.
X_train2 = full_df[:1120000, :]
X_valid = full_df[1120000:1600000, :]

# Encode the 'Sentiment' labels; the encoder is fitted on the full data.
le = LabelEncoder()
y_train1 = le.fit_transform(full_data['Sentiment'])
y_train2 = le.transform(full_data['Sentiment'][:1120000])
y_valid = le.transform(full_data['Sentiment'][1120000:])
y_test = le.transform(test_data_pos_neg['Sentiment'])

###### Try Bernoulli Naive Bayes model without word stemming ######

## Convert the word frequency matrix into a binary (present/absent) matrix.
X_train1_bin = X_train1.copy()
X_train1_bin[X_train1_bin > 0] = 1

clf_ber_bayes = BernoulliNB()
clf_ber_bayes.fit(X_train1_bin, y_train1)

# Accuracy on the data the model was trained on (the value is not printed).
train_preds = clf_ber_bayes.predict(X_train1_bin)
accuracy_score(train_preds, y_train1)

# Binarise the test matrix the same way before predicting.
X_test_bin = X_test.copy()
X_test_bin[X_test_bin > 0] = 1

test_preds = clf_ber_bayes.predict(X_test_bin)
accuracy_score(y_test, test_preds)
## 84.12 % accuracy_score
print Matr.shape Matr=Matr[1:] print len(Yval) a=1000 b=100000 prior1=(a+spamc-1)*1.0/(a+b+spamc+legitc-2) prior2=(a+legitc-1)*1.0/(a+b+spamc+legitc-2) # y=beta.pdf(x, a, b) from sklearn.metrics import precision_recall_curve from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import train_test_split a_train, a_test, b_train, b_test = train_test_split(Matr, Yval, test_size=0.2, random_state=42) clf = MultinomialNB(class_prior=[1,2]) clf2= BernoulliNB(class_prior=[prior1,prior2]) clf.fit(a_train, b_train) clf2.fit(a_train, b_train) Ax=clf.predict(a_test) Bx=clf2.predict(a_test) from sklearn.metrics import f1_score #print f1_score(b_test, Ax, average='macro') print f1_score(b_test, Bx, average='macro') import matplotlib.pyplot as plt precision, recall, _ = precision_recall_curve(b_test, Bx) plt.step(recall, precision, color='b', alpha=0.2,where='post') plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
def main():
    """Train the feature-union pipeline and report its test-set scores.

    Categories are read from the command line, e.g.
    ``python3 test.py Comedy Drama Documentary Horror``. Several feature
    views are built per document, 20% of the data is held out, the pipeline
    is trained, and accuracy, a classification report and a confusion-matrix
    heatmap (saved as a PNG named after the classifier) are produced.
    """
    show_plots = False  # set to True to show plots, False to not show plots

    # Read categories from the command-line arguments.
    categories = list(sys.argv[1:])

    X, y, files_used = read_files(categories)

    try:
        high_info_words = high_information_words(X, y)
        # Keep only the high-information words of every bag.
        X_high_info = [
            [word for word in bag if word in high_info_words] for bag in X
        ]
    except ZeroDivisionError:
        print("Not enough information too get high information words, please try again with more files.", file=sys.stderr)
        # Fall back to the unfiltered bags.
        X_high_info = X

    X_wpm = wpm(files_used, categories, show_plots)
    X_dpm = dpm(files_used, categories, show_plots)
    X_wd = word_distribution(files_used, categories)

    doc2vec_model = Doc2Vec.load("d2v_150.model")
    #doc2vec_model = Doc2Vec.load("d2v_400.model")

    # The vectors are looked up rather than inferred because these documents
    # were already used (as tagged documents) when the vector model was
    # trained.
    X_d2v = [doc2vec_model.docvecs[str(i)] for i in range(len(X))]
    #X_d2v = [doc2vec_model.infer_vector(to_list(str(i))) for i in X]

    # One tuple of feature views per document.  The loop names carry a
    # `_val` suffix so they do not shadow the wpm()/dpm() helpers.
    X = [
        (str(text), str(text_high), wpm_val, dpm_val, wd_val, d2v_val)
        for text, text_high, wpm_val, dpm_val, wd_val, d2v_val
        in zip(X, X_high_info, X_wpm, X_dpm, X_wd, X_d2v)
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10)

    clfs = [
        # parameters found using grid_search.py
        SVC(C=10, cache_size=500, class_weight=None, coef0=0.0,
            decision_function_shape=None, degree=3, gamma=0.0001,
            kernel='linear', max_iter=100000, probability=False,
            random_state=None, shrinking=True, tol=0.001, verbose=False),
        MultinomialNB(alpha=1.0),
        BernoulliNB(),
    ]

    # The four numeric features share the same select-then-scale pipeline.
    scaled_features = [
        (key, Pipeline([
            ('selector', ItemSelector(key=key)),
            ('scaler', MinMaxScaler()),
        ]))
        for key in ('wpm', 'dpm', 'wd', 'd2v')
    ]

    pipeline = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the individual feature views
        ('union', FeatureUnion(
            transformer_list=[
                # Pipeline bag-of-words model
                ('text', Pipeline([
                    ('selector', ItemSelector(key='text')),
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, binary=True,
                                              norm='l2', ngram_range=(1, 3))),
                    #('chi-square', SelectKBest(chi2, 300)),
                ])),

                # Pipeline for high info words bag-of-words model
                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_high')),
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, norm='l2')),
                ])),

                # Pipeline for POS tag features (disabled)
                # ('pos', Pipeline([
                #     ('selector', ItemSelector(key='pos')),
                #     ('words', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3)))
                # ])),
            ] + scaled_features,

            # weight components in FeatureUnion
            transformer_weights={
                'text': 0.2,
                'text_high': 1,
                'wpm': 0,
                'dpm': 0.2,
                'wd': 0,
                'd2v': 0,
                #'pos': 0,
            },
        )),

        # Use a classifier on the combined features
        ('classifier', clfs[0]),
    ])

    train(pipeline, X_train, y_train, categories, show_plots)

    final_pred = pipeline.predict(X_test)
    print("\nScores on test set:\n")
    print(metrics.accuracy_score(y_test, final_pred))
    print(metrics.classification_report(y_test, final_pred, digits=3))

    confusion_m = metrics.confusion_matrix(y_test, final_pred, labels=categories)
    # Class name of the classifier, i.e. its repr up to the first "(".
    clf_name = str(pipeline.named_steps['classifier']).split("(")[0]

    plt.figure(figsize=(16, 9), dpi=150)
    sn.set(font_scale=1.4)  # label size
    hm = sn.heatmap(confusion_m, annot=True, fmt='g', annot_kws={"size": 16})  # font size
    hm.set(xticklabels=categories, yticklabels=categories)
    plt.title(clf_name + ' Confusion Matrix')
    if show_plots:
        plt.show()
    # The figure size is set by plt.figure above; savefig does not accept a
    # figsize keyword, so only dpi is passed.
    hm.figure.savefig(clf_name + '_confusion_matrix_test' + '.png', dpi=150)
    plt.close()
# Register the remaining candidate classifiers, in the same order as before.
classifiers.extend([
    GradientBoostingClassifier(random_state=random_state),
    RandomForestClassifier(random_state=random_state),

    # Gaussian process
    GaussianProcessClassifier(random_state=random_state),

    # Generalized linear models
    LogisticRegressionCV(random_state=random_state),
    PassiveAggressiveClassifier(random_state=random_state),
    RidgeClassifierCV(),
    SGDClassifier(random_state=random_state),
    Perceptron(random_state=random_state),
    MLPClassifier(random_state=random_state),

    # Naive Bayes
    BernoulliNB(),
    GaussianNB(),

    # Nearest neighbors
    KNeighborsClassifier(),

    # Discriminant analysis
    LinearDiscriminantAnalysis(),

    # Support vector machines
    SVC(random_state=random_state, probability=True),
    NuSVC(random_state=random_state, probability=True),
    LinearSVC(random_state=random_state),

    # Trees
    DecisionTreeClassifier(random_state=random_state),
])
# [5] Результат в процентах from sklearn.naive_bayes import BernoulliNB from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, confusion_matrix import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") results = [] for clf, name in [ (BernoulliNB(alpha=0.4), 'Native Bayes'), (LinearSVC(C=9), 'SVC'), ( DecisionTreeClassifier(max_depth=26), 'DecisionTreeClassifier', ), # (LogisticRegression(C=12), 'LogisticRegression'), # (RandomForestClassifier(max_depth=2, random_state=0), 'RandomForest'), (KNeighborsClassifier(n_neighbors=13), 'KNN') ]: # Y_train.reshape(Y_train.shape[0],) # Y_test.reshape(Y_test.shape[0]) clf.fit(X_train, Y_train) predictions = clf.predict(X_train) training_accuracy = accuracy_score(predictions, Y_train)
# NOTE: this import deliberately rebinds the name `eval` to the scorer's
# main(); the builtin eval() is unavailable below.
from scorer_semeval18 import main as eval

# SECURITY: pickle.load executes arbitrary code from the file; only load
# token files from a trusted source.
with open(TOK_TWEETS_PATH, 'rb') as tweets_file:
    tokenized_tweets = pickle.load(tweets_file)
print('loaded tweets')

data_matrix = construct_data_matrix(tokenized_tweets)
print('constructed data matrix')
print('Dim:', data_matrix.shape)
print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix))

# One label per line of the labels file.
with open(CLEAN_LABELS_PATH) as labels_file:
    labels = np.asarray(labels_file.read().splitlines())

data_train, data_test, labels_train, labels_test = split_data(
    data_matrix, labels)
print('split data')

bern = BernoulliNB()
bern.fit(data_train, labels_train)
print("\nbern", bern.score(data_test, labels_test))
eval(labels_test, bern.predict(data_test))

# MultinomialNB rejects negative feature values, so the data is shifted.
# The same offset must be used for fitting, scoring and predicting;
# previously the training and test splits were shifted by different amounts
# and predict() received the unshifted test data.  Taking the smaller of the
# two minima keeps both splits non-negative.
nb_offset = abs(min(np.min(data_train), np.min(data_test)))
multi = MultinomialNB()
multi.fit(data_train + nb_offset, labels_train)
print("\nmulti", multi.score(data_test + nb_offset, labels_test))
eval(labels_test, multi.predict(data_test + nb_offset))

tree = DecisionTreeClassifier(max_depth=10)
tree.fit(data_train, labels_train)
print("\ntree", tree.score(data_test, labels_test))
eval(labels_test, tree.predict(data_test))

clf = RandomForestClassifier(max_depth=3)
def classify(X, y, clf_type='nbc'):
    """
    Run nested cross-validation for one classifier on bag-of-N-gram features.

    Preprocess the input documents to extract feature vector representations
    of them. Features are N-gram counts, for N<=2.

    1. Experiment with the complexity of the N-gram features (i.e., unigrams,
       or unigrams and bigrams): `gram_min` + `gram_max`
    2. Experiment with removing stop words. (see NLTK)
    3. Remove infrequently occurring words and bigrams as features. The
       threshold at which infrequent words and bigrams are removed is tunable.
    4. Search over hyperparameters for the three models (nb, svm, lr) to
       find the best performing model.

    All 4 of the above are done in the context of FOLDS-fold cross validation
    on the data. On the training data, an inner cross validation (INNER) is
    done to find the optimal hyperparameters (using randomized CV), which are
    then tested on held-out data.

    Args:
        X: Documents, indexable by an array of integer indices.
        y: Labels aligned with ``X``, indexable the same way.
        clf_type: ``'nbc'`` (BernoulliNB), ``'svc'`` (LinearSVC) or ``'lrc'``
            (LogisticRegression).

    Returns:
        dict with keys ``'test'`` and ``'train'`` (per-fold lists of loss,
        accuracy and confusion matrix; ``'test'`` also collects up to five
        misclassified inputs per fold) and ``'cv'`` (the best searched
        hyperparameters of the last fold).

    Raises:
        ValueError: If ``clf_type`` is not one of the supported values.
    """
    if clf_type == 'nbc':
        clf = BernoulliNB()
        params = SETTINGS_NB
    elif clf_type == 'svc':
        clf = LinearSVC()
        params = SETTINGS_SVC
    elif clf_type == 'lrc':
        clf = LogisticRegression()
        params = SETTINGS_LR
    else:
        # Literal braces must be doubled, otherwise str.format treats
        # "{nbc, svc, lrc}" as a replacement field and raises KeyError
        # instead of reporting the bad value.
        raise ValueError(
            'invalid clf {}: {{nbc, svc, lrc}}'.format(clf_type))

    # pipeline runs preprocessing and model during every CV loop
    pipe = Pipeline([
        ('pre', CountVectorizer()),
        ('clf', clf),
    ])

    model = RandomizedSearchCV(
        pipe, params, n_jobs=-1, n_iter=N_CV, cv=INNER, scoring='f1_macro'
    )

    results = {
        'test': {'loss': [], 'accuracy': [], 'confusion': [], 'errors': []},
        'train': {'loss': [], 'accuracy': [], 'confusion': []},
        'cv': {}
    }

    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True)

    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print("[{}] {}/{}".format(clf_type, i+1, FOLDS))

        # split training and test sets
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]

        # fit model
        model.fit(X_train, y_train)

        # save the best parameters from the inner-fold cross validation;
        # each fold overwrites the values stored by the previous one
        best_params = model.best_estimator_.get_params()
        for p in sorted(params.keys()):
            results['cv'][p] = best_params[p]

        # make predictions on train and test set
        y_test_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        # record some misclassified sentences (a random sample of up to 5)
        idx_errors = np.where(y_test_pred != y_test)[0]
        np.random.shuffle(idx_errors)
        errors = X_test[idx_errors[:5]]
        results['test']['errors'].extend(errors)

        # store results
        # NOTE(review): log_loss receives hard label predictions here, not
        # probabilities -- confirm this is the intended loss.
        results['test']['loss'].append(log_loss(y_test, y_test_pred))
        results['test']['accuracy'].append(accuracy_score(y_test, y_test_pred))
        results['test']['confusion'].append(confusion_matrix(y_test, y_test_pred))

        results['train']['loss'].append(log_loss(y_train, y_train_pred))
        results['train']['accuracy'].append(accuracy_score(y_train, y_train_pred))
        results['train']['confusion'].append(confusion_matrix(y_train, y_train_pred))

    return results
# vectorizer = TfidfVectorizer(use_idf=True, norm="l2")
X = vectorizer.fit_transform(all_reviews).toarray()

# Labels: 1 for every positive review followed by 0 for every negative one.
pos, neg = len(pos_reviews), len(neg_reviews)
y = [1] * pos + [0] * neg

# Splitting data into Training and Testing Sets
# divide data into 20% test set and 80% training set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Training and testing the model
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

classifier = BernoulliNB(alpha=0.001)
classifier.fit(X_train, y_train)

# predict the sentiment for the documents in our test
y_pred = classifier.predict(X_test)

# Check the accuracy of the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# '''Saving and Loading the Model'''
# # save trained model for later use.
# with open('RF_classifier', 'wb') as picklefile:
#     pickle.dump(classifier, picklefile)
# # to load the model use
# with open('RF_classifier', 'rb') as training_model:
votes.append(v) return str(mode(votes)[0]) def confidence(self, features): votes =[] for c in self._classifiers: v = c.predict(features) votes.append(v) choice_votes = int(mode(votes)[1]) conf = choice_votes / len(votes) return conf #def test_accuracy(self, x2,x3,x4,x5,x6, x7): # average = mean([x2,x3,x4,x5,x6, x7]) # return average BNB = BernoulliNB() BNB.fit(tfidf_train, y_train) pred = BNB.predict(tfidf_test) score = metrics.accuracy_score(y_test, pred) x2 = metrics.accuracy_score(y_test, pred) print("BernoulliNB Naive Bayes Accuracy: %0.3f" % score) #cm = metrics.confusion_matrix(y_test, pred, labels=[0,1]) #plot_confusion_matrix(cm, classes=[0, 1]) save_classifier = open("Pickled/BernoulliNB.pickle", "wb") pickle.dump(BNB, save_classifier) save_classifier.close() LR = LogisticRegression() LR.fit(tfidf_train, y_train) pred = LR.predict(tfidf_test)
def __init__(self):
    """Create the wrapper with a default BernoulliNB model and the short name "nb"."""
    self.model = BernoulliNB()
    self.name = "nb"
def classification_naive_bayes(X, Y, nome):
    """Evaluate a default BernoulliNB on (X, Y) via classification_model_cv.

    ``nome`` is appended to the "Naive Bayes " label passed along with the
    model.
    """
    label = "Naive Bayes " + nome
    classification_model_cv(X, Y, BernoulliNB(), label)
import numpy as np
from sklearn.naive_bayes import BernoulliNB

# Random demo data: 10000 samples with 100 integer features in [0, 100) and
# one integer label in [0, 5) per sample.
X = np.random.randint(100, size=(10000, 100))
Y = np.random.randint(5, size=(10000, 1))

clf = BernoulliNB()
# Y is a column vector; fit() expects 1-D labels and emits a
# DataConversionWarning otherwise, so pass the single column as a 1-D view.
clf.fit(X, Y[:, 0])

# Predict the class of one random sample with features in [0, 10).
Z = np.random.randint(10, size=(1, 100))
print(clf.predict(Z))
def classification_voting(X,y, nome):
    """Evaluate a soft-voting ensemble (random forest + BernoulliNB).

    The ensemble is handed to classification_model_cv together with the
    label "Voting Model " followed by ``nome``.
    """
    forest = RandomForestClassifier(
        n_estimators=30,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    bernoulli = BernoulliNB()
    # Soft voting combines the predicted class probabilities of both members.
    ensemble = VotingClassifier(
        estimators=[('rf', forest), ('bnb', bernoulli)],
        voting='soft',
    )
    classification_model_cv(X, y, ensemble, "Voting Model "+nome)
df = pd.concat([df, dummy], axis=1) df = df.drop(columns=feat, axis=1) dummy = None # split data to train, heldout, and test datasets print('INFO: Spliting data into train/heldout/test datasets.') x_train = np.array(df[df['data'] == 'T'].drop(columns=['data', 'result'])) y_train = np.array(df[df['data'] == 'T']['result'].astype('bool')) x_valid = np.array(df[df['data'] == 'V'].drop(columns=['data', 'result'])) y_valid = np.array(df[df['data'] == 'V']['result'].astype('bool')) x_hold = np.array(df[df['data'] == 'H'].drop(columns=['data', 'result'])) y_hold = np.array(df[df['data'] == 'H']['result'].astype('bool')) # machine learning classification models classif = [('Gaussian Naive Bayes', GaussianNB()), ('Bernoulli Naive Bayes', BernoulliNB()), ('COmplement Naive Bayes', ComplementNB()), ('Multinomial Naive Bayes', MultinomialNB()), ('LOGistic Regression', LogisticRegression(solver='liblinear', multi_class='ovr', penalty='l2', random_state=24)), ('LOGistic Regression 2', LogisticRegression(solver='saga', multi_class='ovr', l1_ratio=0.3, penalty='elasticnet', max_iter=1000, random_state=24)), ('LOGistic Regression 3',
print("t = %d" % X_valid.shape[0]) print("Num classes = %d" % len(np.unique(y))) model = RandomForestClassifier() model.fit(X, y) y_pred = model.predict(X_valid) v_error = np.mean(y_pred != y_valid) print("Random Forest (sklearn) validation error: %.3f" % v_error) model = NaiveBayes(num_classes=4, beta=1) model.fit(X, y) y_pred = model.predict(X_valid) v_error = np.mean(y_pred != y_valid) print("Naive Bayes (ours) validation error: %.3f" % v_error) model = BernoulliNB() model.fit(X, y) y_pred = model.predict(X_valid) v_error = np.mean(y_pred != y_valid) print("Naive Bayes (sklearn) validation error: %.3f" % v_error) elif question == '2': dataset = load_dataset('vowel.pkl') X = dataset['X'] y = dataset['y'] X_test = dataset['Xtest'] y_test = dataset['ytest'] print("\nn = %d, d = %d\n" % X.shape) def evaluate_model(model): model.fit(X,y)
print("Total sarcastic lines = " + str(sarcasm_size))
neutral_size = len(neutral)
print("Total non-sarcastic lines = " + str(neutral_size))

# Label 1 for every sarcastic line, then 0 for every neutral line, matching
# the order in which the two sets are concatenated below.
labels.extend([1] * sarcasm_size)
labels.extend([0] * neutral_size)
print(len(labels))

dataset = np.concatenate([sarcasm, neutral])
print("Total length of dataset = " + str(len(dataset)))

# Classify using Naive Bayes:
from sklearn.naive_bayes import BernoulliNB

vec = TfidfVectorizer(min_df=5)
clf = BernoulliNB()

td_matrix = vec.fit_transform(dataset)
print("Shape of matrix = " + str(td_matrix.shape))
print("Length of the labels = " + str(len(labels)))

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    td_matrix, labels, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
y_out = clf.predict(X_test)

# Only the first five characters of the percentage are shown.
print("Accuracy on held-out data: ",
      str(100 * accuracy_score(y_out, y_test))[0:5],
      "%\n")
#Accuracy on held-out data: MultinomialNB 83.79 %, BernoulliNB 84.49%, DecisionTree=84.40%, RandomForest=82.39%
classifier.show_most_informative_features(15) save_classifier = open("pickled_algos/originalnaivebayes5k.pickle", "wb") pickle.dump(classifier, save_classifier) save_classifier.close() MNB_classifer = SklearnClassifier(MultinomialNB()) MNB_classifer.train(training_set) print("MNB Acc: " + str((nltk.classify.accuracy(MNB_classifer, testing_set)) * 100)) save_classifier = open("pickled_algos/MNB_classifier5k.pickle", "wb") pickle.dump(MNB_classifer, save_classifier) save_classifier.close() BernoulliNB_classifer = SklearnClassifier(BernoulliNB()) BernoulliNB_classifer.train(training_set) print("BernoulliNB Acc: " + str((nltk.classify.accuracy(BernoulliNB_classifer, testing_set)) * 100)) save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle", "wb") pickle.dump(BernoulliNB_classifer, save_classifier) save_classifier.close() # SVC_classifer = SklearnClassifier(SVC()) # SVC_classifer.train(training_set) # print("SVC: "+ str((nltk.classify.accuracy(SVC_classifer, testing_set))*100)) LinearSVC_classifer = SklearnClassifier(LinearSVC()) LinearSVC_classifer.train(training_set) print("LinearSVC: " +
# Everything from index 800 onwards is held out for testing.
testing_set = featuresets[800:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
# Accuracy is a fraction; multiply by 100 (not 1000) to report a percentage.
print("NaiveBayes Classifier accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set))*100)
# The method name has a single underscore between "informative" and
# "features"; the doubled underscore raised AttributeError.
classifier.show_most_informative_features(15)

from sklearn import metrics

# NOTE(review): SKlearnClassifer and its `.clf` attribute are not defined in
# the lines visible here; presumably a wrapper defined or imported elsewhere
# -- confirm.
MNB_clf = SKlearnClassifer(MultinomialNB())
# `MNB_clf_train` was an undefined name; train through `.clf` exactly as the
# other wrapped classifiers below do.
mnb_cls = MNB_clf.clf.train(training_set)
print("MultinomialNB Classifier accuracy percent:",
      (nltk.classify.accuracy(mnb_cls, testing_set))*100)

BNB_clf = SKlearnClassifer(BernoulliNB())
bnb_cls = BNB_clf.clf.train(training_set)
print("BernoulliNB Classifier accuracy percent:",
      (nltk.classify.accuracy(bnb_cls, testing_set))*100)

LogReg_clf = SKlearnClassifer(LogisticRegression())
logReg_cls = LogReg_clf.clf.train(training_set)
print("LogisticRegression Classifier accuracy percent:",
      (nltk.classify.accuracy(logReg_cls, testing_set))*100)

SGD_clf = SKlearnClassifer(SGDClassifier())
sgd_cls = SGD_clf.clf.train(training_set)
print("SGD Classifier accuracy percent:",
      (nltk.classify.accuracy(sgd_cls, testing_set))*100)
# df1 = transformer.fit_transform(jieba_cut_content) print('=' * 30 + '下面开始svd分解降维计算' + '=' * 30) svd = TruncatedSVD(n_components=20) svd_model = svd.fit(df1) df2 = svd_model.transform(df1) data = pd.DataFrame(df2) print('=' * 30 + '重新构建矩阵开始' + '=' * 30) data['has_date'] = list(x_train['has_date']) data['content_length_sema'] = list(x_train['content_length_sema']) #以上完成了数据清洁工作-------->tf-idf文本转换和svd降维 print('=' * 30 + '构建伯努利贝叶斯模型' + '=' * 30) nb = BernoulliNB(alpha=1.0, binarize=0.0005) #二值转换阈值 model = nb.fit(data, y_train) print('=' * 30 + '构建测试集' + '=' * 30) jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str')) data_test = pd.DataFrame( svd_model.transform(transformer_model.transform(jieba_cut_content_test))) data_test['has_date'] = list(x_test['has_date']) data_test['content_length_sema'] = list(x_test['content_length_sema']) print('=' * 30 + '开始测试集预测' + '=' * 30) start = time.time() y_predict = model.predict(data_test) end = time.time() print('朴素贝叶斯预测共耗时%.2f秒' % (end - start))
#Decision Tree Classifier clf_DT = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=10, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, min_impurity_decrease=1e-07) clf_DT.fit(X_train, y_train) y_pred_DT = clf_DT.predict(X_val) #Naive Bayes Classifier clf_NB = BernoulliNB() clf_NB.fit(X_train, y_train) y_pred_NB = clf_NB.predict(X_val) #NN Classifier MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(64), learning_rate='constant', learning_rate_init=0.001, max_iter=2000,
# Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" results.append( benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))) # Train NearestCentroid without threshold print 80 * '=' print "NearestCentroid (aka Rocchio classifier)" results.append(benchmark(NearestCentroid())) # Train sparse Naive Bayes classifiers print 80 * '=' print "Naive Bayes" results.append(benchmark(MultinomialNB(alpha=.01))) results.append(benchmark(BernoulliNB(alpha=.01))) class L1LinearSVC(LinearSVC): def fit(self, X, y): # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. self.transformer_ = LinearSVC(penalty="l1", dual=False, tol=1e-3) X = self.transformer_.fit_transform(X, y) return LinearSVC.fit(self, X, y) def predict(self, X): X = self.transformer_.transform(X) return LinearSVC.predict(self, X)
Y = data[:, -1] X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) return (X_train, X_test, y_train, y_test) X, X_test, y, y_test = split_data('a_processed.txt') #逻辑回归二分类模型 model = LogisticRegression(solver='liblinear') model.fit(X, y) model.score(X, y) weights = model.coef_ intercept = model.intercept_ predicted = model.predict(X_test) #伯努利朴素贝叶斯模型 clf = BernoulliNB() clf.fit(X, y) BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) #多项式分布模型 mul = MultinomialNB().fit(X, y) #print (clf.predict(X_test)) #预测模型 def prediction(pre, ac, right, wrong): for i in range(len(pre)): if pre[i] == ac[i]: right = right + 1 else: wrong = wrong + 1 rate = right / (right + wrong)
print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) score = f05_scorer(clf, X_test, y_test) if score > best_score: best_clf = clf best_score = score fout = open('kbest-multinomialNB.pickle','w') pickle.dump(clf,fout) fout.close() ####################### print "Bernoulli NB" clf = BernoulliNB(binarize = 0.0, alpha = 0.25, fit_prior = False) kf = KFold(72000, n_folds=10, shuffle=True) best_score = 0 best_clf = 0 for train_index, test_index in kf: print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) score = f05_scorer(clf, X_test, y_test) if score > best_score: best_clf = clf best_score = score fout = open('kbest-bernoulliNB.pickle','w')