def train_model(X, y_train):
    """Train a Multinomial Naive Bayes classifier and log its CV score.

    :param X: feature matrix (e.g. document-term counts)
    :param y_train: training labels
    :return: the fitted MultinomialNB model
    """
    model_NB = MNB()
    # Feed the feature matrix straight into the classifier.
    model_NB.fit(X, y_train)
    # BUG FIX: removed a no-op `MNB(alpha=1.0, class_prior=None,
    # fit_prior=True)` expression whose result was discarded, and return the
    # fitted model as the docstring promises.
    # 20-fold cross-validated ROC AUC as a quick quality estimate.
    score = np.mean(
        cross_val_score(model_NB, X, y_train, cv=20, scoring='roc_auc'))
    logging.info(f'多项式贝叶斯分类器20折交叉验证得分{score}')
    return model_NB
def main():
    """Train a Multinomial Naive Bayes spam classifier and persist it."""
    # Deferred import keeps the dependency local to this entry point.
    from preprocess import Preprocessing

    # BUG FIX: the original assigned the class object itself
    # (`prepro = Preprocessing`) instead of instantiating it, so calling
    # instance methods such as split_data() would fail.
    prepro = Preprocessing()
    # Get data and labels from the Preprocessing instance.
    X, Y = prepro.split_data()
    # Convert raw data into a feature list.
    feature_set, lable = make_dataset(X, Y)
    # Hold out 20% of the data for testing.
    X_train, X_test, Y_train, Y_test = tts(feature_set, lable, test_size=0.2)
    # Multinomial Naive Bayes suits word-count style features.
    classifier = MNB()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)
    # Accuracy: fraction of test labels matched by the predictions.
    print("Accuracy of Classifier :")
    print(accuracy_score(Y_test, predictions))
    # Persist the trained classifier for later reuse.
    with open('spam_classifier.mdl', 'wb') as scla:
        pickle.dump(classifier, scla)
def mnb(self):
    """Fit a Multinomial Naive Bayes model on the training split and print
    its accuracy and classification report on the test split."""
    model = MNB()
    model.fit(self.X_train, self.Y_Train)
    predicted = model.predict(self.X_Test)
    accuracy = model.score(self.X_Test, self.Y_Test)
    print("using mnb, score %s" % accuracy)
    print(classification_report(self.Y_Test, predicted))
def main():
    """Compare several classifiers on the spambase dataset by plotting their
    ROC curves (with AUC in the legend) and saving the figure to roc.pdf."""
    df = pd.read_csv("/home/saxobeat/PythonML/MLCodes/Spambase/Dataset/spamdata.csv")
    # Columns 0-56 are features; column 57 is the spam/ham label.
    features = df.iloc[:, 0:57].values
    labels = df.iloc[:, 57].values
    X_train, X_test, y_train, y_test = tts(features, labels, test_size=0.25,
                                           shuffle=True, random_state=8)
    models = []
    models.append(('LR', LR(solver='lbfgs', max_iter=2000, tol=0.0001)))
    models.append(('LDA', LDA()))
    models.append(('DTC', DTC()))
    models.append(('KNC', KNC()))
    models.append(('MNB', MNB()))
    models.append(('RFC', RFC(n_estimators=100)))
    models.append(('SVC', SVC(gamma='scale', kernel='rbf', probability=True)))
    # Diagonal reference line: performance of a random classifier.
    # (Removed an unused `x0 = np.linspace(...)` local.)
    plt.plot([0, 1], [0, 1], 'k', linestyle='--')
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        # Probability of the positive class drives the ROC curve.
        y_score = y_pred[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        label = "{}({})".format(name, auc(fpr, tpr))
        plt.plot(fpr, tpr, label=label)
    plt.legend()
    # BUG FIX: corrected the typos "Reciever" and "Fasle" in chart text.
    plt.title("Receiver Operating Characteristics")
    plt.grid()
    plt.cool()
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.savefig("roc.pdf")
def pred():
    """Fit MultinomialNB on pickled, pre-fitted training features, report the
    20-fold cross-validated ROC AUC, and write positive-class probabilities
    for the test set to ../results/MNBPredictions.csv."""
    # Restore the previously fitted feature matrices and the label column.
    trainAfterFit = pickle.load(open("../picks/fittedTrainData.pkl", "rb"))
    predCol = pickle.load(open("../picks/predCol", "rb"))
    testAfterFit = pickle.load(open("../picks/fittedTestData.pkl", "rb"))
    # The raw test file supplies the "id" column for the output CSV.
    test = pd.read_csv('../data/testData.tsv', header=0, delimiter="\t",
                       quoting=3)
    modelMNB = MNB()
    modelMNB.fit(trainAfterFit, predCol)
    # 20-fold cross-validated ROC AUC as a quality estimate.
    cv_mean = np.mean(cross_val_score(modelMNB, trainAfterFit, predCol,
                                      cv=20, scoring='roc_auc'))
    print("20 Fold CV Score for Multinomial Naive Bayes: ", cv_mean)
    # Probability estimates; classes are ordered by label, so [:, 1] is the
    # positive class.
    MNBresult = modelMNB.predict_proba(testAfterFit)[:, 1]
    MNBoutput = pd.DataFrame(data={"id": test["id"], "sentiment": MNBresult})
    MNBoutput.to_csv('../results/MNBPredictions.csv', index=False, quoting=3)
def pipeline_predict(request):
    """Django view: classify POSTed text with a CountVectorizer +
    MultinomialNB pipeline trained on the fly from the Data table, and render
    the majority-vote category.
    """
    if request.method == 'POST':
        preTest = request.POST.get('text_to_classif')
        # Each whitespace-separated token is classified independently.
        predictTestData = preTest.split()
        print("*********************", predictTestData)
        py_pipeline = Pipeline([
            ("count", CV()),
            # ("tfid", TF()),
            ("multi", MNB())
        ])
        # Build the training set from every Data row: the text column holds
        # comma-separated examples that all share the row's category label.
        dbData = Data.objects.all()
        X_language_train = []
        y_language_train = []
        for each in dbData:
            xlt = each.text.split(", ")
            ylt = len(each.text.split(", ")) * [each.category]
            X_language_train.extend(xlt)
            y_language_train.extend(ylt)
        print(X_language_train, y_language_train)
        # NOTE(review): the pipeline is retrained on every request — fine for
        # tiny tables, but consider caching the fitted model.
        py_pipeline.fit(X_language_train, y_language_train)
        prediction = py_pipeline.predict(predictTestData)
        print("*********************", prediction)
        # Majority vote across the per-token predictions.
        appearances = defaultdict(int)
        for curr in prediction:
            appearances[curr] += 1
        answer = max(appearances, key=appearances.get)
        print("*********************", answer)
        # score = py_pipeline.score(span_test_data, y)
        context = {'response': answer}
        return render(request, 'classifier_app/tindex.html', context)
def mnb(self):
    """Fit MultinomialNB on the training split, print its 20-fold
    cross-validated ROC AUC, store it as the best classifier, and return it."""
    model = MNB()
    model.fit(self.X_train, self.Y_Train)
    cv_mean = np.mean(cross_val_score(model, self.X_train, self.Y_Train,
                                      cv=20, scoring='roc_auc'))
    print("20 Fold CV Score for Multinomial Naive Bayes: %f" % cv_mean)
    self.best_clf = model
    return model
def createClassifier(type, model_data):
    """Build a named Classifier wrapper for the requested model type.

    :param type: "MNB" or "LinearSVC"; any other value yields False
    :param model_data: dict of hyper-parameters for the chosen model
    :return: a Classifier instance, or False for an unknown type
    """
    if type == "MNB":
        model = MNB(alpha=model_data["alpha"],
                    fit_prior=model_data["fit_prior"])
        return Classifier(name='多项式朴素贝叶斯分类器', model=model)
    if type == "LinearSVC":
        model = LinearSVC(tol=model_data["tol"],
                          C=model_data["C"],
                          penalty=model_data["penalty"],
                          loss=model_data["loss"])
        return Classifier(name='线性核SVM分类器', model=model)
    return False
def build_mnb_model(X_train_dtm, y_train):
    """Fit and return a Multinomial Naive Bayes classifier.

    :param X_train_dtm: training document-term matrix
    :param y_train: training target labels
    :return: fitted Multinomial Naive Bayes model
    """
    classifier = MNB()
    classifier.fit(X_train_dtm, y_train)
    return classifier
def _make_tfidf_NB_clf(self, **cfg):
    """Assemble a TF-IDF + MultinomialNB pipeline from optional config.

    Recognised cfg keys: max_features (default 1200), max_df (default 0.7)
    and sublin (default True, enables sublinear TF scaling).
    """
    vec = Tfidf(stop_words='english',
                norm='l2',
                max_df=cfg.get('max_df', 0.7),
                max_features=cfg.get('max_features', 1200),
                sublinear_tf=cfg.get('sublin', True))
    return Pipeline(steps=[('v', vec), ('nb', MNB())])
def learnData(xData, yData, f_obj, MLtype):
    """Train the classifier selected by MLtype on several train/test splits
    and write each split's accuracy to f_obj.

    BUG FIX: the original used independent `if` statements with a trailing
    `else`, so for 'LSVC' and 'LR' the chosen classifier was immediately
    overwritten by MLPC(); an if/elif chain selects exactly one model.

    :param xData: feature matrix
    :param yData: labels
    :param f_obj: open writable file object for the accuracy log
    :param MLtype: one of 'LSVC', 'LR', 'MNB'; anything else uses MLPC
    """
    f_obj.write('Accuracy for {}:\n'.format(MLtype))
    for test in [0.10, 0.15, 0.20, 0.25]:
        xData_train, xData_test, yData_train, yData_test = tts(
            xData, yData, test_size=test, random_state=42)
        if MLtype == 'LSVC':
            clf = LSVC()
        elif MLtype == 'LR':
            clf = LR()
        elif MLtype == 'MNB':
            clf = MNB()
        else:
            clf = MLPC()
        clf.fit(xData_train, yData_train)
        score = clf.score(xData_test, yData_test)
        f_obj.write('\ttest partition {} yields {} accuracy\n'.format(test,
                                                                      score))
    f_obj.write('\n')
def train(self):
    """Fit the classifier selected by self.model on the training set and
    return its training-set accuracy.

    Supported models: 'GNB', 'BNB', 'MNB', 'LR' (grid-searched C), 'SVM'
    (grid-searched C) and 'R' (stratified random-guess baseline).
    Exits the process for unsupported model names (original behavior kept).

    :return: training-set accuracy of the fitted classifier
    """
    logging.info('-' * 20)
    logging.info('Start training the %s model', self.model)
    train_data = self.feature_extractor.extract_feature(
        self.data_loader.get_trainset())
    if self.model == 'GNB':  # Gaussian naive bayes
        self.classifier = GNB()
    elif self.model == 'BNB':  # Bernoulli naive bayes
        self.classifier = BNB()
    elif self.model == 'MNB':  # Multinomial naive bayes
        self.classifier = MNB()
    elif self.model == 'LR':  # Logistic regression
        # Grid-search the inverse regularisation strength C.
        param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
        self.classifier = GS(cv=5,
                             estimator=LR(penalty=self.penalty,
                                          max_iter=self.epoch,
                                          solver='liblinear'),
                             param_grid=param)
    elif self.model == 'SVM':  # Support vector machine
        # Dual optimisation is only valid with the l2 penalty.
        self.penalty = self.penalty if self.penalty in ['l1', 'l2'] else 'l2'
        dual = self.penalty == 'l2'
        param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
        self.classifier = GS(cv=5,
                             estimator=SVM(penalty=self.penalty,
                                           dual=dual,
                                           max_iter=self.epoch),
                             param_grid=param)
    elif self.model == 'R':  # Random-guess baseline
        self.classifier = DC(strategy='stratified')
    else:
        logging.info('Unsupported model : %s', self.model)
        exit(0)
    self.classifier.fit(train_data[0], train_data[1])
    # BUG FIX: the original called predict() twice in a row and discarded the
    # first result; a single call is sufficient.
    predictions = self.classifier.predict(train_data[0])
    acc = evaluator.accuracy_score(train_data[1], predictions)
    return acc
def OcToFr(data, CV, target, names):
    """Convert count features to TF-IDF, fit MultinomialNB on them, and print
    the predicted category name for two demo documents.

    :param data: count (occurrence) feature matrix
    :param CV: the fitted CountVectorizer used to transform new documents
    :param target: training labels
    :param names: index-to-category-name mapping
    """
    # TF-only transform (no IDF) — fitted and printed just for its shape.
    tf_only = TTransformer(use_idf=False).fit(data)
    X_train_tf = tf_only.transform(data)
    print(X_train_tf.shape)
    # Full TF-IDF transform feeds the actual model.
    ttransformer = TTransformer()
    Xtrain = ttransformer.fit_transform(data)
    clf = MNB().fit(Xtrain, target)
    docs_new = ['God is love', 'OpenGL in the GPU is fast']
    # New documents pass through the same count + TF-IDF steps.
    X_new_tfidf = ttransformer.transform(CV.transform(docs_new))
    for doc, category in zip(docs_new, clf.predict(X_new_tfidf)):
        print('%r => %s' % (doc, names[category]))
def MNBpredictor(X_train, y_train, X_test):
    """Fit MultinomialNB on the training data and predict labels for X_test.

    Also records the training-set accuracy in the module-level accModels
    dict and the test predictions in the predictions dict, keyed by the
    model class name.

    :param X_train: training features
    :param y_train: training labels
    :param X_test: test features
    :return: (test predictions, training-set accuracy)
    """
    from sklearn.naive_bayes import MultinomialNB as MNB

    model = MNB()
    model.fit(X_train, y_train)
    # Training-set accuracy (optimistic, but usable for model comparison).
    y_pred = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, y_pred)
    # BUG FIX: removed the unused log-loss computation — it was calculated on
    # hard class predictions (not probabilities) and its result was never read.
    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred
    return y_pred, accuracy
def fit(self, max_features=1000):
    """Train text models predicting the outcome for seen non-renewal
    insureds, printing train/test AUROC for a TF-IDF + MultinomialNB model
    and for a stemmed CountVectorizer + SGD pipeline.

    BUG FIX: the test matrix was built with vectorizer.fit_transform, which
    refits the vocabulary on the test texts and makes its columns
    incompatible with the trained model; transform() must be used instead.
    """
    stemmer = SnowballStemmer('english')
    # Only seen, non-renewal insureds with usable (string) names that have
    # shelf entries.
    names = set([
        name for name, ren in zip(sample_df.insuredname, sample_df.renewal)
        if type(name) is str and name in seen_names and name in self.shelf
        and ren == 0
    ])
    # NOTE(review): iterating a set gives an arbitrary (per-run) order, so
    # the 90/10 split below is not reproducible — consider sorting `names`.
    train_cutoff = int(len(names) * 0.9)
    texts = [self.shelf[name]['results'] for name in names]
    train_texts = texts[:train_cutoff]
    test_texts = texts[train_cutoff:]
    ys = [int(self.shelf[name]['outcome']) for name in names]
    train_ys = ys[:train_cutoff]
    test_ys = ys[train_cutoff:]
    print("making train_tf")
    train_tf = self.vectorizer.fit_transform(
        [text for text in train_texts])
    self.nb = MNB()
    self.nb.fit(train_tf.todense(), train_ys)
    train_yhats = self.nb.predict_proba(train_tf.todense())[:, 1]
    print("making test_tf")
    # Use the vocabulary fitted on the training texts (was fit_transform).
    test_tf = self.vectorizer.transform([text for text in test_texts])
    test_yhats = self.nb.predict_proba(test_tf.todense())[:, 1]
    print("train AUROC:", roc_auc_score(train_ys, train_yhats))
    print("test AUROC:", roc_auc_score(test_ys, test_yhats))
    # Alternative pipeline: stemmed counts -> TF-IDF -> logistic-loss SGD.
    text_clf = Pipeline([
        ('vect', CountVectorizer(max_features=10000,
                                 preprocessor=stemmer.stem)),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='log', penalty='elasticnet',
                              alpha=0.00001))
    ])
    text_clf.fit(train_texts, train_ys)
    train_yhats = text_clf.predict_proba(train_texts)[:, 1]
    test_yhats = text_clf.predict_proba(test_texts)[:, 1]
    print("train AUROC:", roc_auc_score(train_ys, train_yhats))
    print("test AUROC:", roc_auc_score(test_ys, test_yhats))
def naive_bayes(train_x, test_x, test, label):
    """Multinomial Naive Bayes: fit on the training data, report 10-fold
    cross-validated ROC AUC, and preview sentiment predictions for the test
    set.

    :param train_x: training feature matrix
    :param test_x: test feature matrix
    :param test: test DataFrame providing the 'id' column
    :param label: training labels
    """
    model_NB = MNB()  # (alpha=1.0, class_prior=None, fit_prior=True)
    # Fit on all training data so the model can be used for prediction.
    model_NB.fit(train_x, label)
    # PERF FIX: the original ran cross_val_score twice (10 extra fits) for
    # identical deterministic results; compute the fold scores once.
    scores = cross_val_score(model_NB, train_x, label, cv=10,
                             scoring='roc_auc')
    print("多项式贝叶斯分类器10折交叉验证得分: \n", scores)
    print("多项式贝叶斯分类器10折交叉验证平均得分: ", np.mean(scores))
    test_predicted = np.array(model_NB.predict(test_x))
    # Assemble the submission-style frame and preview the first rows.
    submission_df = pd.DataFrame(data={
        'id': test['id'],
        'sentiment': test_predicted
    })
    print("结果:")
    print(submission_df.head(100))
def OnRunModels(data):
    """Split the data, then train and evaluate four classifiers (KNN, SVM,
    MultinomialNB, logistic regression), printing a classification report
    and confusion matrix for each.

    Refactor: the original repeated an identical fit/predict/report stanza
    four times; a (name, model) loop produces byte-identical output.
    """
    xtrain, xtest, ytrain, ytest = OnSplitData(data)
    classifiers = [
        ('KNN', KNN(n_neighbors=10, metric='minkowski', p=2)),
        ('SVM', SVM.SVC(kernel='linear', C=1.0, gamma='auto')),
        ('MNB', MNB()),
        ('LREG', LREG(random_state=0)),
    ]
    for name, model in classifiers:
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        print(' %s Results : ' % name)
        print(' Classification Report')
        print(CREP(ytest, preds))
        print(' Confusion Matrix')
        print(CMAT(ytest, preds), '\n\n')
def createClassifier(self, config):
    """Instantiate the classifier named by self.classifier, configured from
    the given hyper-parameter dict.

    Returns None (implicitly, like the original) for unknown names.
    """
    # Lazy builders: only the selected entry reads its config keys.
    builders = {
        "lr": lambda: LogisticRegression(class_weight='balanced',
                                         penalty=config["penalty"],
                                         C=config["C"]),
        "gnb": lambda: GaussianNB(),
        "gp": lambda: GaussianProcessClassifier(1.0 * RBF(1.0),
                                                warm_start=True),
        "mnb": lambda: MNB(alpha=config["alpha"],
                           fit_prior=config["fit_prior"]),
        "svm": lambda: SVC(C=config["C"], kernel=config["kernel"],
                           class_weight='balanced'),
        "rf": lambda: RFC(n_estimators=config["n_estimators"],
                          class_weight='balanced'),
        "dt": lambda: DTC(criterion=config["criterion"],
                          class_weight='balanced'),
        "nbsvm": lambda: NBSVM(C=config["C"], beta=config["beta"]),
    }
    builder = builders.get(self.classifier)
    return builder() if builder is not None else None
def check_alphas(X, y):
    u"""Takes in an X matrix and a Y array of labels.

    Checks five possible alpha values; returns the classifier with the
    highest cross-validated score, refitted on the full data. (Python 2.)
    """
    best = None
    best_score = None
    # Candidate Laplace-smoothing strengths, log-spaced from 1e-4 to 1.
    alphas = [1E-4, 1E-3, 1E-2, 1E-1, 1]
    for alpha in alphas:
        mnb = MNB(alpha)
        # Mean accuracy over 10-fold cross-validation.
        score = np.mean(cross_val_score(mnb, X, y, cv=10))
        print "alpha: ", alpha, "score: ", score
        # First iteration seeds the running best (best is None only then).
        if not best:
            best = mnb
            best_score = score
            best_alpha = alpha
        elif score > best_score:
            best_score = score
            best = mnb
            best_alpha = alpha
    # Refit the winning configuration on all the data before returning it.
    best.fit(X, y)
    print "our best score and our best alpha:"
    print best_score, best_alpha
    return best
def train():
    """Train a MultinomialNB model on the prepared training set, pickle the
    model and the vocabulary to disk, print the 5-fold cross-validated
    ROC AUC, and return the vocabulary. (Python 2.)
    """
    X, y_train, voc = get_trainset()
    from sklearn.naive_bayes import MultinomialNB as MNB
    # Default Laplace smoothing; class priors learned from the data.
    model_NB = MNB(alpha=1.0, class_prior=None, fit_prior=True)
    print 'Start Training!\n'
    start = time.time()
    model_NB.fit(X, y_train)
    end = time.time()
    # Persist the fitted model for later prediction runs.
    f = open('model2.pickle', 'wb')
    f.write(pickle.dumps(model_NB))
    f.close()
    print 'Finish Training!\n'
    from sklearn.cross_validation import cross_val_score
    import numpy as np
    # 5-fold cross-validated ROC AUC as a quick quality estimate.
    print np.mean(cross_val_score(model_NB, X, y_train, cv=5,
                                  scoring='roc_auc'))
    # Persist the vocabulary so prediction reuses the same feature columns.
    f = open('voc_senti.pickle', 'wb')
    f.write(pickle.dumps(voc))
    f.close()
    return voc
import numpy as np,pandas as pd,os,re,io,sys
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import LinearSVC as LSVC

# Models evaluated by perform_train_test; display names are kept in a
# parallel list (same order).
models = [LSVC(), MNB(), LR(), RFC()]
modelNames = ['Linear SVC', 'MultinomialNB','LogisticRegression','RandomForestClassifier']
stats_name = 'stats_ML_models.txt'


########################## TRAIN AND TEST ON MODELS ###########################
###############################################################################
def perform_train_test(pd_df,dataName,f_obj,X,y):
    """Train/test every model in `models` on TF-IDF features of column X,
    predicting the factorised labels of column y.

    Writes a per-model accuracy line to f_obj and dumps each model's test
    predictions to a CSV file.

    :param pd_df: source DataFrame
    :param dataName: dataset name used in log lines and CSV filenames
    :param f_obj: open file object for the stats log
    :param X: name of the text column
    :param y: name of the label column
    """
    # Factorise labels to integer ids; keep a reverse map for readable output.
    pd_df['cat_id']=pd.Series(pd_df[y].factorize()[0]).astype(int)
    id_label_map = dict(pd_df[['cat_id',y]].drop_duplicates().sort_values('cat_id').values)
    f_obj.write('Stats for dataset {}:\n'.format(dataName))
    for (index,model) in enumerate(models):
        X_train,X_test,y_train,y_test=train_test_split(pd_df[X],pd_df['cat_id'],test_size=0.2,random_state=0)
        X_train_obj=TfidfVectorizer().fit(X_train) #returns TFIDF object FITTED TO TRAINING PARTITION
        X_train_tfidf=X_train_obj.transform(X_train)
        clf = model.fit(X_train_tfidf,y_train)
        # Transform the test split with the training-fitted vectoriser.
        X_test_tfidf=X_train_obj.transform(X_test)
        accuracy=clf.score(X_test_tfidf,y_test)
        # Map predicted ids back to human-readable labels for the CSV dump.
        pred_lab = [id_label_map[int(id_index)] for id_index in clf.predict(X_test_tfidf)]
        pred_df = pd.DataFrame(data={'Review Test':X_test, 'Label Test':pred_lab},columns=['Review Test','Label Test'])
        pred_df.to_csv('pred_{}_for{}.csv'.format(dataName,modelNames[index]))
        f_obj.write('\tAccuracy of model {} with data {}: {}\n'.format(modelNames[index],dataName,accuracy))
    f_obj.write('\n')
tfv.fit(train_text)
# Transform train, unlabeled and test texts with one shared vocabulary, then
# slice the combined matrix back into its three parts (left-closed,
# right-open ranges).
X_all = train_text + unlabeled_text + test_text
len_train = len(train_text)
len_unlabeled = len(unlabeled_text)
X_all = tfv.transform(X_all)
train_X = X_all[:len_train]
unlabeled_X = X_all[len_train:len_train + len_unlabeled]
test_X = X_all[len_train + len_unlabeled:]
# MultinomialNB defaults (for reference):
#   alpha (float, default 1.0): Laplace smoothing (0 disables smoothing).
#   fit_prior (bool, default True): if False, use a uniform prior.
#   class_prior (optional, default None): fixed class priors; when given,
#     priors are not adjusted from the data.
# BUG FIX: removed a no-op `MNB(alpha=1.0, class_prior=None, fit_prior=True)`
# expression whose constructed instance was discarded.
model_NB = MNB()
model_NB.fit(train_X, train_label)  # feed the feature matrix straight in
# Predict on the unlabeled and test sets; output is for reference.
print("predict")
unlabeled_label = model_NB.predict(unlabeled_X)
test_label = model_NB.predict(test_X)
# Build the text-classification task from the command-line file arguments.
quest = TextClassification(train_filename=args.train,
                           test_filename=args.test,
                           categories=[])
x_train, y_train = quest.readFileAndCut(quest.train_filename)
x_test, y_test = quest.readFileAndCut(quest.test_filename)
# Shared TF-IDF features for whichever classifier is selected below.
train_tfidf, test_tfidf = quest.train_tf(x_train, x_test)


def execClassify(name, model):
    """Wrap `model` in a named Classifier, fit it on the training TF-IDF
    matrix, and evaluate it on the test set."""
    classifier = Classifier(name, model)
    classifier.fit(train_tfidf, y_train)
    classifier.test(y_test, test_tfidf)


# Dispatch on the -m argument; imports are deferred so only the selected
# library is loaded.
if args.m == 'mnb':
    from sklearn.naive_bayes import MultinomialNB as MNB
    execClassify(name='多项式朴素贝叶斯分类器', model=MNB())
elif args.m == 'bnb':
    from sklearn.naive_bayes import BernoulliNB as BNB
    execClassify(name='伯努利朴素贝叶斯分类器', model=BNB())
elif args.m == 'linearSVC':
    from sklearn.svm import LinearSVC
    execClassify(name='线性SVM分类器', model=LinearSVC())
elif args.m == 'dt':
    from sklearn.tree import DecisionTreeClassifier
    execClassify(name='决策树分类器', model=DecisionTreeClassifier())
elif args.m == 'knn':
    from sklearn.neighbors import KNeighborsClassifier
    execClassify(name='KNN分类器', model=KNeighborsClassifier())
elif args.m == 'xgb':
    import xgboost as xgb
    # NOTE(review): this branch appears truncated here — dtrain is built but
    # never used in the visible code; confirm against the full file.
    dtrain = xgb.DMatrix(train_tfidf, label=y_train)
    return words


if __name__ == "__main__":
    # NOTE(review): the CSV paths are placeholder strings (", ") — fill in
    # the real labeled train/test file locations before running.
    train = pd.read_csv(", ", header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(", ", header=0, delimiter="\t", quoting=3)
    num_review = train["review"].size
    # Clean every review with the review_to_words helper defined above.
    clean_train = []
    for i in range(0, num_review):
        clean_train.append(review_to_words(train["review"][i]))
    # Bag-of-words features capped at the 5000 most frequent terms.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    tran_data_f = vectorizer.fit_transform(clean_train)
    tran_data_f = tran_data_f.toarray()
    model_NB = MNB()
    model_NB.fit(tran_data_f, train["sentiment"])
    # 20-fold cross-validated ROC AUC.
    score = np.mean(
        cross_val_score(model_NB,
                        tran_data_f,
                        train["sentiment"],
                        cv=20,
                        scoring="roc_auc"))
    print("MultinomialNB score: %s" % score)
# Pair each prediction with its id and collect the output rows.
for i in range(len(line)):
    rows.append([ids[i], line[i]])
# Write the (id, sentiment) rows next to the source file as <name>.csv.
out = open(path.split('.')[0] + '.csv', 'a', newline='')
csv_write = csv.writer(out, dialect='excel')
csv_write.writerow(['id', 'sentiment'])
for v in rows:
    csv_write.writerow(v)
out.close()

# Classify with Naive Bayes; several alternative models are constructed
# below but only MNB is actually trained in this snippet.
from sklearn.naive_bayes import MultinomialNB as MNB
label = train['sentiment']
MNBmodle = MNB(alpha=1.0, class_prior=None, fit_prior=True)
svm_model = LinearSVC()  # SVM
knn = KNeighborsClassifier()  # K-nearest neighbours
mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30),
                    activation='logistic',
                    max_iter=100)  # multi-layer perceptron
clf = tree.DecisionTreeClassifier(criterion='gini')  # decision tree
print('train MNB')
# Time the fit with wall-clock timestamps.
begin = datetime.datetime.now()
MNBmodle.fit(train_x, label)
end = datetime.datetime.now()
k = end - begin
print('MNB训练时长:', k.total_seconds())
# NOTE(review): `modle` looks like a typo — presumably MNBmodle was meant;
# confirm against the rest of the file before changing.
predict_save(modle, 'MNB.json')
# Separate TF-IDF vocabularies for the raw text and for the POS-tag strings.
tfidf_vectorizer_pos = TfidfVectorizer()
data_tfidf_text = tfidf_vectorizer_text.fit_transform(data['text'])
data_tfidf_pos = tfidf_vectorizer_pos.fit_transform(data['posTags_string'])
# Concatenate POS and text features column-wise into one dense matrix.
data_tfidf = hstack([data_tfidf_pos, data_tfidf_text]).toarray()
data_tfidf = pd.DataFrame(data_tfidf)
# Simple 75/25 sequential split (no shuffling).
split = int(len(data)*0.75)
y_train = data['isClickbait'][:split].values
y_test = data['isClickbait'][split:].values
X_train = data_tfidf[:split].values
X_test = data_tfidf[split:].values
# Four baseline classifiers compared on test-set accuracy.
svm = LinearSVC()
mnb = MNB()
lr = LR()
rf = RFC(n_estimators = 100)
models = {'Linear Support Vector': svm, 'Multinomial Naive Bayes': mnb,
          'Logistic Regression': lr, 'Random Forest': rf}
p = []
for n, m in models.items():
    m.fit(X_train, y_train)
    predictions = m.predict(X_test)
    # Keep each model's predictions for later inspection.
    p.append(predictions)
    print('%s : %.3f' % (n, accuracy_score(y_test, predictions)))
filt = np.logical_not(y == '-99') #filt = np.logical_not(np.logical_or(y == 'Unknown', y == '-99')) #filt = np.logical_not(np.logical_or(np.logical_or(y == 'Unknown', y == '-99'), y == 'Various')) y = y[filt] #- Predictors X = pd.get_dummies(X, dummy_na=True) X = X[filt] le_y = preprocessing.LabelEncoder() y = le_y.fit_transform(y) #%% Initialize model clf = MNB() #%% Cross Validation using Stratisfied 10-Fold kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0) scores = [] for train_idx, test_idx in kf.split(X, y): #print("TRAIN:", train_idx, "TEST:", test_idx) X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] y_train, y_test = y[train_idx], y[test_idx] model = clf.fit(X_train, y_train) predictions = model.predict(X_test) scores.append(accuracy_score(y_test, predictions)) print("Model training complete!")
from sklearn.datasets import load_svmlight_file as svmlight
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB as MNB, BernoulliNB as BNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# NOTE(review): two hard-coded dataset locations; only `folder` is used.
# Consider taking the path from the command line instead of hard coding it.
folderpath = r"D:\my_data"
folder = "C:/Users/hanfs/Desktop/data-mining-project/training_data_file.TF.txt"
# Load sparse features and targets from the svmlight/libsvm-format file.
feature_vectors, targets = svmlight(folder)

### Generate the classifiers and score them with 5-fold macro-F1 ###
print("TF Data")
clf = MNB()
scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
clf = BNB()
scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# THE SPARSITY CALCULATION sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])) print(sparsity) # TFIDF TRANSFORMATION tt = TT().fit(messages_bow) tfidf4 = tt.transform(bow4) print(tfidf4) # TFIDF TRANSFORMATION OF BOW TRANSFORMATION (THE COUNT VECTORIZED MESSAGES) messages_tfidf = tt.transform(messages_bow) # NAIVE BAYES spam_detect_model = MNB().fit(messages_tfidf,df['label']) pred4 = spam_detect_model.predict(tfidf4)[0] print(pred4) pred = spam_detect_model.predict(messages_tfidf) rate = np.mean(pred == df['label']) print("Rate: {}\n".format(rate)) # TRAIN AND TEST msg_train,msg_test,label_train,label_test = TTS(df['msg'],df['label'],test_size=0.3,random_state=64) # PIPELINE - A WAY TO STORE DATA PREPARATION PIPELINE pipe = Pipeline([ ('bow',CV(analyzer=text_process)), # COUNT VECTORIZER ('tfidf',TT()), # TFIDF TRANSFORMER
def main():
    """Iterative naive-Bayes agent-classification demo (Python 2).

    Loads feature/label data, seeds a MultinomialNB classifier, then runs an
    event loop that dequeues agents, classifies them, checks their integrity
    against per-cluster mean scores, and periodically re-trains via
    partial_fit.
    """
    # 1. Data Preparation
    data = numpy.loadtxt('mod_data.txt')
    labels = numpy.loadtxt('mod_labels.txt')
    test_data = numpy.loadtxt('mod_test_data.txt')
    test_labels = numpy.loadtxt('mod_test_label.txt')
    # Each test row carries its label in the final column.
    test = numpy.column_stack((test_data, test_labels))
    # 2. Create Sorted Data Vector by Label (10 clusters, labels 1..10)
    sorted_data = [[], [], [], [], [], [], [], [], [], []]
    idx = 0
    for row in data:
        sorted_data[int(labels[idx]) - 1].append(row)
        idx += 1
    sorted_data = numpy.asarray(sorted_data)
    # 3. Calculate Cluster Scores: per-cluster feature means, rounded.
    cluster_scores = [[], [], [], [], [], [], [], [], [], []]
    for idx in range(0, 10):
        cluster_scores[idx] = numpy.mean(sorted_data[idx], axis=0)
    cluster_scores = numpy.around(cluster_scores)
    # 4. Initial Training
    nb_classifier = MNB().fit(data, labels)
    # 5. Initial Enqueue of All New Agents
    new_queue = Queue.Queue(0)
    recycle_queue = Queue.Queue(0)
    for v in test:
        new_queue.put(v)
    # 6. Event Loop
    itr = 1
    correct = 0
    for i in range(0, 500):
        print "[Iteration %d]" % itr
        agent = None
        # 6-1 New Agent Queue takes priority...
        if not new_queue.empty():
            agent = new_queue.get()
            print("New Agent Dequeued")
        # 6-2 ...then the Recycled Agent Queue...
        else:
            if not recycle_queue.empty():
                agent = recycle_queue.get()
                print("Recycled Agent Dequeued")
            # 6-3 ...otherwise occasionally fire a random idle event.
            else:
                n = random.random()
                if (n < 0.01):
                    print("Random Event!")
                    time.sleep(2)
                    continue
        # 6-4 Classification (skipped when both queues were empty)
        if agent is not None:
            result = nb_classifier.predict(agent[0:-1])
            if int(result) == int(agent[-1]):
                correct += 1
            idx = int(result) - 1
            print "Label: %d" % int(result)
            # 6-5 Integrity Check (Tolerance to 10, Reject Threshold 10 ~ 1.7%)
            integrities = numpy.isclose(cluster_scores[idx], agent[0:-1],
                                        atol=50)
            accepted = True if numpy.bincount(integrities)[0] < 50 else False
            print "Accepted: %r" % accepted
            # 6-6 Re-train Classifier
            if accepted:
                # 6-6-1 Add Accepted Agent into Table
                sorted_data[idx].append(agent[0:-1])
                # 6-6-2 Recalculate Cluster Scores
                cluster_scores[idx] = numpy.mean(sorted_data[idx], axis=0)
                cluster_scores[idx] = numpy.around(cluster_scores[idx],
                                                   decimals=5)
                # 6-6-3 Recheck Integrity of Individual Agent
                index = 0
                for row in sorted_data[idx]:
                    row_integrities = numpy.isclose(cluster_scores[idx], row,
                                                    atol=5)
                    # NOTE(review): this reuses `integrities` from step 6-5
                    # rather than the `row_integrities` computed just above —
                    # likely a bug; confirm intent before changing.
                    accepted = True if numpy.bincount(
                        integrities)[0] < 5 else False
                    if not accepted:
                        # NOTE(review): enqueues the boolean `accepted`, not
                        # the rejected row, and numpy.delete's return value
                        # is discarded (it does not modify in place). Both
                        # look like bugs — confirm intent.
                        recycle_queue.put(accepted)
                        numpy.delete(sorted_data[idx], row)
                        print "Agent %d in Cluster %d Rejected. Placed in recycle_queue" % (
                            index, idx)
                    index += 1
                # 6-6-4 Update Naive Bayes Classifier (Partial Fit)
                nb_classifier = nb_classifier.partial_fit(
                    cluster_scores, numpy.arange(1, 11))
            # 6-7 Enqueue in Recycled Queue
            else:
                recycle_queue.put(agent)
        print "New Agent Queue: %d\nRecycle Queue: %d\n" % (
            new_queue.qsize(), recycle_queue.qsize())
        itr += 1
    print "Correct: %d" % (correct)