def train(cutoffs):
    """Fit a Bernoulli naive Bayes model file by file and pickle the result.

    The input files come from ``get_io_addr`` when ``__TRAIN_DATA`` has three
    entries, otherwise from ``get_io_addr_random_sample``.  In every loaded
    matrix the last column is used as the label and the remaining columns as
    features.

    cutoffs: passed to ``discard_vars`` whenever it is non-empty.
    """
    print("\n========== Start Training ==========")

    if len(__TRAIN_DATA) == 3:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1], __TRAIN_DATA[2])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])

    clf = BernoulliNB(fit_prior=True)

    for path_in in list_io_addr:
        print("\nGenerating training set from {}".format(path_in))
        with open(path_in, "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            print("Discarding selected features......")
            X = discard_vars(X, cutoffs)

        # Label lives in the final column; everything before it is a feature.
        label_col = len(X[0]) - 1
        X_train = X[:, 0:label_col]
        y_train = X[:, label_col]
        print("Done")

        # SMOTE oversampling was disabled in the original:
        # sm = SMOTE(ratio=0.9)
        # X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    with open(__ROOT_MODEL, "w") as file_out:
        pickle.dump(clf, file_out)
def train_nb(self, X, y):
    """Fit a weighted Bernoulli naive Bayes model and print its CV MAP@5.

    X: feature table; must expose an ``is_booking`` attribute, which is used
       to give booking rows a sample weight of 5 instead of 1.
    y: class labels (the model is declared with classes ``0..99``).

    Returns ``(clf, X_test, y_test)`` where the test part is a stratified
    20% hold-out that the classifier has not seen.
    """

    def map5eval(actual, preds):
        """Mean average precision at 5 for one true label per row."""
        # Columns sorted by descending probability, best guess first.
        # (The previous ``[:, -np.arange(5)]`` picked columns 0,-1,-2,-3,-4,
        # i.e. it ranked the *least* likely class first.)
        top5 = np.argsort(preds, axis=1)[:, ::-1][:, :5]
        actual = np.asarray(actual)
        metric = 0.
        for rank in range(top5.shape[1]):
            # float() keeps the division true division on Python 2 as well.
            metric += np.sum(actual == top5[:, rank]) / float(rank + 1)
        metric /= actual.shape[0]
        return metric

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, stratify=y, test_size=0.2)

    map5 = make_scorer(map5eval, greater_is_better=True, needs_proba=True)

    clf = BernoulliNB(alpha=1.0)
    sw = 1 + 4 * X_train.is_booking
    clf.partial_fit(X_train, y_train, classes=np.arange(100), sample_weight=sw)

    # NOTE(review): map5eval compares labels with probability *column
    # indices*; that is only valid when the fitted classes are 0..k-1 —
    # confirm for the cross-validated clones.
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring=map5)
    print(score)

    return clf, X_test, y_test
def step4():
    """Train a naive Bayes model on 'dataset-train.csv' in chunks, then evaluate.

    Uses the module-level settings ``steps`` (a Bernoulli model is chosen when
    it contains ``'ber'``, a multinomial one otherwise), ``smooth`` (alpha),
    ``chunk`` (rows per ``partial_fit`` call) and ``lenDict`` (number of
    feature columns).  Every CSV row is integer features followed by the
    label in the last field.
    """
    print('Training...')

    if 'ber' in steps:
        model = BernoulliNB(alpha=smooth)
    else:
        model = MultinomialNB(alpha=smooth)

    X = np.zeros((chunk, lenDict))
    COUNT = 0

    # The context manager closes the file even if a row fails to parse.
    with open('dataset-train.csv', 'r') as trainCsv:
        isDone = False
        while not isDone:
            ct = 0
            Y = []
            # Resumes where the previous chunk stopped reading.
            for line in trainCsv:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                vector = [int(i) for i in line.split(',')]
                X[ct] = vector[:-1]
                Y.append(vector[-1])
                ct += 1
                if ct == chunk:
                    break

            if ct == 0:
                break
            # A short chunk means the file is exhausted.
            isDone = ct != chunk

            # Only the first ``ct`` rows of the buffer hold fresh data.
            model.partial_fit(X[:ct], Y, classes=[0, 1])
            COUNT += ct
            print(COUNT)

    print('Testing on training set...')
    test_on_csv(model, 'dataset-train.csv')
    print('Testing on test set...')
    test_on_csv(model, 'dataset-test.csv')
def train_bnb_model(msg):
    """Fit a Bernoulli naive Bayes model on the data selected by *msg*.

    A copy of *msg* is flagged with ``train=True`` and, when it has no
    ``'month'`` entry, ``'month'`` is taken from ``'train_month'``.  The first
    two items returned by ``dataset.get_data`` are used as inputs and targets.

    Returns the fitted ``BernoulliNB`` (declared with classes ``0..23``).
    """
    params = msg.copy()
    params['train'] = True
    if 'month' not in params:
        params['month'] = params['train_month']

    input_data, output_data = dataset.get_data(params)[0:2]

    model = BernoulliNB(alpha=1e-2)
    model.partial_fit(input_data, output_data, classes=range(24))
    return model
def trainBernoulliNB(X,y,loadweights):
    """Train (or continue training) a BernoulliNB and persist it to disk.

    X, y: training features and binary labels (classes ``[0, 1]``).
    loadweights: when truthy, start from the model pickled at
        'weights/BernoulliNB.pickle' instead of a fresh one.

    The same batch is fed to ``partial_fit`` ten times, the model is written
    back to the pickle file and its accuracy on (X, y) is printed.
    """
    print("Training BernoulliNB...")

    if loadweights:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open('weights/BernoulliNB.pickle', 'rb') as handle:
            BN_classifier = pickle.load(handle)
    else:
        BN_classifier = BernoulliNB()

    passes_left = 10
    while passes_left > 0:
        BN_classifier.partial_fit(X, y, classes=[0, 1])
        passes_left -= 1

    with open('weights/BernoulliNB.pickle', 'wb') as handle:
        pickle.dump(BN_classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)

    training_accuracy = BN_classifier.score(X, y)
    print(training_accuracy)
def train(cutoffs):
    """Incrementally fit a Bernoulli naive Bayes model on SMOTE-resampled data.

    File addresses come from ``get_io_addr`` when ``__DATA_FROM == 2`` and from
    ``get_io_addr_random_sample`` otherwise.  When ``__IF_TRAIN_WITHOUT_SAVE``
    is set, a CorEx layer is first built from a fixed sample file and applied
    to every training matrix.

    cutoffs: passed to ``discard_vars`` whenever it is non-empty.

    Returns ``[clf, layer]`` when ``__IF_TRAIN_WITHOUT_SAVE`` is set; otherwise
    the model is pickled to ``__ROOT_MODEL`` and ``[]`` is returned.
    """
    print("\n========== Start Training ==========")

    address_source = get_io_addr if __DATA_FROM == 2 else get_io_addr_random_sample
    list_io_addr = address_source(__TRAIN_DATA[0], __TRAIN_DATA[1])

    clf = BernoulliNB(class_prior=[0.05, 0.95])

    if __IF_TRAIN_WITHOUT_SAVE:
        print("Performing correlation explanation......")
        with open("/home/wlu/Desktop/day_samp_bin_1-2.npy", "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)
        if len(cutoffs) > 0:
            X = discard_vars(X, cutoffs)
        layer = correlation_ex(X)

    for path_in in list_io_addr:
        print("\nGenerating training set from {}".format(path_in))
        with open(path_in, "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)
        if len(cutoffs) > 0:
            X = discard_vars(X, cutoffs)

        # Last column is the label, the rest are features.
        label_col = len(X[0]) - 1
        X_train = X[:, 0:label_col]
        y_train = X[:, label_col]

        if __IF_TRAIN_WITHOUT_SAVE:
            print("Transforming training set according to CorEx......")
            X_train = corex_transform(layer, X_train)

        # NOTE(review): oversampling is applied to every file regardless of
        # the CorEx flag — confirm this matches the original nesting.
        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    if __IF_TRAIN_WITHOUT_SAVE:
        return [clf, layer]

    with open(__ROOT_MODEL, "w") as file_out:
        pickle.dump(clf, file_out)
    return []
def _partial_fit_label_in_chunks(machine, vectorizer, samples, label, classes, chunk_size):
    """Vectorise *samples* in batches and partial-fit each batch as *label*."""

    def flush(batch):
        features = vectorizer.fit_transform(batch)
        machine.partial_fit(features, [label] * len(batch), classes=classes)

    batch = []
    for sample in samples:
        batch.append(sample)
        if len(batch) >= chunk_size:
            flush(batch)
            batch = []
    # Left-over samples that did not fill a whole chunk.
    if len(batch) != 0:
        flush(batch)


def main(pos_tweets_filename, neg_tweets_filename, tweets_count=float("inf"),
         pos_label="pos", neg_label="neg", chunk_size=10000, features_used=(1, 1)):
    """Train a Bernoulli naive Bayes tweet classifier out of core.

    pos_tweets_filename / neg_tweets_filename: inputs handed to
        ``extract_features`` together with the matching label.
    tweets_count: forwarded to ``extract_features``.
    pos_label / neg_label: class labels; both are declared on every
        ``partial_fit`` call.
    chunk_size: number of samples vectorised and fitted per batch.
    features_used: ``ngram_range`` of the ``HashingVectorizer``.

    All positive samples are fitted first, then all negative ones.
    Returns ``(machine, vectorizer)``.
    """
    machine = BernoulliNB()
    vectorizer = HashingVectorizer(ngram_range=features_used)
    classes = [pos_label, neg_label]

    labelled_sources = (
        (pos_tweets_filename, pos_label),
        (neg_tweets_filename, neg_label),
    )
    for filename, label in labelled_sources:
        _partial_fit_label_in_chunks(
            machine, vectorizer,
            extract_features(filename, label, tweets_count),
            label, classes, chunk_size)

    return (machine, vectorizer)
def nb_onehot():
    """Train one Bernoulli naive Bayes model per tag on one-hot sequences.

    Data comes from ``util.create_or_load_data(freq_threshold=50)``.  For each
    column of ``Y_tr`` a model is fitted batch-wise, its validation AUC is
    logged, and its test-set probabilities are collected.  Finally the average
    validation AUC is logged and ``util.submission`` is called with the
    per-tag test predictions.
    """
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(freq_threshold=50)
    batch_size = 10000

    def onehot_batches(seqs):
        """Yield ``(start, stop, encoded_batch)`` covering every sequence."""
        total = len(seqs)
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            yield start, stop, [data_process.seq2onehot(seq, dictionary)
                                for seq in seqs[start:stop]]

    def predict_in_batches(model, seqs):
        """Collect ``predict_proba`` rows for all of *seqs*."""
        probabilities = []
        for _, _, batch in onehot_batches(seqs):
            probabilities.extend(model.predict_proba(batch))
        return probabilities

    Y_te_pred_list = []
    sum_auc_va = 0.0
    for i in range(Y_tr.shape[1]):
        nb = BernoulliNB()
        # The previous bound ``len(X_tr) - 1`` silently dropped the last
        # training sample and could hand partial_fit an empty final batch.
        for start, stop, batch in onehot_batches(X_tr):
            nb.partial_fit(batch, Y_tr[start:stop, i], classes=[0, 1])
        logging.info("Finish training")

        Y_va_pred = predict_in_batches(nb, X_va)
        auc_va = util.auc(Y_va[:, i], Y_va_pred)
        logging.info("tag{}, valid auc: ".format(i) + str(auc_va))
        sum_auc_va += auc_va

        Y_te_pred_list.append(predict_in_batches(nb, X_te))

    logging.info("Avg auc: {}".format(sum_auc_va / Y_tr.shape[1]))
    util.submission(Y_te_pred_list, id_list)
def train():
    """Fit a Bernoulli naive Bayes model over every training address.

    Each address from ``get_io_addr(__TRAIN_DATA)`` is turned into a training
    set by ``gd.get(addr, __RATIO)`` and fed to ``partial_fit``.

    Returns the classifier when ``__SAVE_MODEL`` is false; otherwise pickles
    it to ``__ROOT_MODEL`` and returns ``None``.
    """
    print("\n========== Start Training ==========")

    list_io_addr = get_io_addr(__TRAIN_DATA)
    clf = BernoulliNB(class_prior=[0.1, 0.9], alpha=0.5)

    for addr_in in list_io_addr:
        print("\nGenerating training set from {}".format(addr_in))
        X_train, y_train = gd.get(addr_in, __RATIO)
        print("Done")

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    if not __SAVE_MODEL:
        return clf

    with open(__ROOT_MODEL, "w") as file_out:
        pickle.dump(clf, file_out)
    return None
def test_alpha():
    """alpha=0 must be clipped with a warning instead of producing NaNs.

    Covers the case where p(x_i|y_j) = 0 for dense input (BernoulliNB,
    MultinomialNB, CategoricalNB) and sparse input (BernoulliNB,
    MultinomialNB).
    """
    msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10"
    y = np.array([0, 1])

    def check(estimator_cls, X, expected, incremental):
        # Fit with alpha=0, expecting the clipping warning every time.
        nb = estimator_cls(alpha=0.0)
        if incremental:
            with pytest.warns(UserWarning, match=msg):
                nb.partial_fit(X, y, classes=[0, 1])
        with pytest.warns(UserWarning, match=msg):
            nb.fit(X, y)
        assert_array_almost_equal(nb.predict_proba(X), np.array(expected))

    bernoulli_prob = [[1, 0], [0, 1]]
    multinomial_prob = [[2.0 / 3, 1.0 / 3], [0, 1]]
    categorical_prob = [[1.0, 0.0], [0.0, 1.0]]

    dense_X = np.array([[1, 0], [1, 1]])
    check(BernoulliNB, dense_X, bernoulli_prob, incremental=True)
    check(MultinomialNB, dense_X, multinomial_prob, incremental=True)
    check(CategoricalNB, dense_X, categorical_prob, incremental=False)

    # Test sparse X
    sparse_X = scipy.sparse.csr_matrix(dense_X)
    check(BernoulliNB, sparse_X, bernoulli_prob, incremental=False)
    check(MultinomialNB, sparse_X, multinomial_prob, incremental=False)
def train_bnb_model(msg):
    """
    Train a Bernoulli naive Bayes model with the given parameters.

    Keys of ``msg``:
    month: int or list with the number of the month whose data is used
        (falls back to ``train_month`` when absent)
    input_columns: list with the names of the columns used in the task
    use_product: bool, if true adds the product columns of the month before
    use_change: bool, if true adds the change columns of the month before

    ``msg`` itself is not modified; a copy flagged with ``train=True`` is sent
    to ``dataset.get_data``.  Returns the fitted model.
    """
    request = msg.copy()
    request['train'] = True
    if 'month' not in request:
        request['month'] = request['train_month']

    # Fetch the training data: inputs first, targets second.
    training = dataset.get_data(request)
    input_data = training[0]
    output_data = training[1]

    # Fit the model on the 24 declared classes.
    bnb = BernoulliNB(alpha=1e-2)
    bnb.partial_fit(input_data, output_data, classes=range(24))
    return bnb
def bern_naive_bayes(df):
    """Fit a Bernoulli naive Bayes model on *df* and report its accuracy.

    The ``"G3"`` column is the target and every remaining column is a feature.
    The model is partially fitted twice, once per deep copy of *df*, then
    scored on *df* itself; the accuracy is printed and ``confuse`` is called
    with the true and predicted labels.

    Side effect: ``"G3"`` is popped from the caller's *df*.

    Returns the fitted ``BernoulliNB``.
    """
    # NOTE(review): the "fail" and "pass" frames are identical full copies of
    # df, so the same rows are fitted twice — confirm whether a real
    # fail/pass split was intended.
    training_batches = []
    for _ in range(2):
        features = df.copy(deep=True)
        target = features.pop("G3")
        training_batches.append((features, target))

    # Target values are G3; the feature set is what remains in df.
    Y = df.pop("G3")
    X = df

    bnb = BernoulliNB()
    for features, target in training_batches:
        bnb.partial_fit(features, target, [0, 1])

    accuracy = bnb.score(X, Y)
    print("\n\nBernoulli Naive Bayes Accuracy: ", accuracy)
    confuse(Y, bnb.predict(X))
    return bnb
class Agent_NaiveBayes(Agent):
    """Agent that predicts the other agent's next action with naive Bayes.

    The last ``windowsize`` actions of the other agent are the features and
    the action that follows them is the label.  When exploiting, the agent
    picks the action with the best average observed reward given the
    predicted action of the other agent.
    """
    # TODO. Currently only works with 1 other user.

    def __init__(self, loc, actspace, index, name=''):
        super().__init__(loc, actspace, index, name)
        n_actions = len(actspace)
        # Joint distribution (counts) of (my action, other's action).
        self.distribution = np.zeros((n_actions, n_actions))
        # Summed rewards for each (my action, other's action) pair.
        self.distribution_rewards = np.zeros((n_actions, n_actions))
        self.model = BernoulliNB()
        self.windowsize = 10  # Configurable
        self.predictions = np.zeros((self.windowsize, 1))

    def updatereward(self, reward, Agents):
        """Record *reward* for the latest joint action and update the model."""
        super().updatereward(reward)

        # Update the joint statistics (who did what).
        other_actions = Agents[1 - self.index].actions
        myact = self.actions[-1]
        otheract = other_actions[-1]
        self.distribution_rewards[myact, otheract] += reward
        self.distribution[myact, otheract] += 1

        # Update the model once enough history is available.
        if len(other_actions) > self.windowsize + 1:
            window = other_actions[-self.windowsize - 2:-2]
            # scikit-learn expects a 2-D X: one row holding the whole window.
            self.model.partial_fit([window], [other_actions[-1]],
                                   classes=self.actionspace)

    def act(self, BSs, variables, Agents, t):
        """Choose, store and return the next action."""
        # TODO write this code
        pexplore = 1 - variables['p_explore']
        if random.random() < pexplore and t > self.windowsize + 2:
            # Exploit stage: predict what the other user will do.
            window = Agents[1 - self.index].actions[-self.windowsize - 1:-1]
            others_predict = int(round(self.model.predict([window])[0]))
            # NOTE(review): the original called np.insert(self.predictions,
            # ...) here and discarded the result, so predictions were never
            # recorded; the no-op call was dropped and self.predictions is
            # left untouched, as before.

            # Average reward of each action given the predicted other action.
            avgOfEach = np.zeros(len(BSs))
            for i in range(len(BSs)):
                seen = self.distribution[i, others_predict]
                if seen > 0:
                    avgOfEach[i] = self.distribution_rewards[i, others_predict] / seen
            # Choose the action that maximizes the expected reward.
            action = avgOfEach.argmax()
        else:
            # Explore stage.
            action = random.randint(0, len(BSs) - 1)

        self.actions.append(action)
        return action
def this_is_for_fun():
    """Incrementally train a BernoulliNB on corpus blocks 10..24.

    One randomly chosen block (10..24) serves as the evaluation set.  After
    every training block the model is evaluated with ``totalScore`` and
    dumped to '../model/BernoulliNB_model_<index>.m'.
    """
    # Alternatives tried earlier: MultinomialNB(alpha=1.0),
    # SGDClassifier(alpha=0.0001), PassiveAggressiveClassifier(),
    # Perceptron(alpha=0.001)
    clf = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
    print('BernoulliNB,a = 1')

    label_list = read_label()

    def load_block(block_id):
        # Documents-by-terms matrix plus the 20000 labels of that block.
        corpus = corpora.MmCorpus('../corpus_mm/corpus_{}.mm'.format(block_id))
        matrix = matutils.corpus2csc(corpus).transpose()
        labels = label_list[(block_id * 20000):(block_id + 1) * 20000]
        return matrix, labels

    choose = random.randint(10, 24)
    test_X, test_y = load_block(choose)  # test set and its labels

    for index in range(10, 25):
        csi_matrix, block_labels = load_block(index)
        clf.partial_fit(csi_matrix, block_labels, classes=np.array([0, 1]))
        print("第{}次".format(index))

        pre = clf.predict(test_X)
        totalScore(pre, test_y)
        joblib.dump(clf, "../model/BernoulliNB_model_{}.m".format(index))
inplace=True) XN = csr_matrix(chunk[num_col].values) X = csr_matrix((chunk.shape[0], n_features)) rows = np.arange(chunk.shape[0]) for col in cat_col_all: dat = np.ones(chunk.shape[0]) cols = chunk[col] % n_features X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features)) X = hstack((XN, X)) book_indices = sw[sw > 1].index.tolist() X_test = csr_matrix(X)[book_indices] y_test = y[book_indices] clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw) count = count + chunksize map5 = map5eval(clf.predict_proba(X_test), y_test) print('%d rows completed. MAP@5: %f' % (count, map5)) if (count / chunksize == 200): break except Exception as e: print('Error: %s' % str(e)) pass #--------------------------对测试集结果进行预测---------------------------- with open('output/probs/bnb.pkl', 'wb') as f: pickle.dump(clf, f) count = 0 chunksize = 10000
val = model_preproc(val)

# BernoulliNB classifier for categorical variables
b_clf = BernoulliNB()

# GaussianNB classifier for continuous variables
g_clf = GaussianNB()

reader = pd.read_hdf(inputpath + 'train_proc_train.h5', key='df', mode='r',
                     iterator=True, chunksize=chunksize_)

# Column 0 is the label, columns 1-9 feed the Gaussian model and columns
# 10 onwards feed the Bernoulli model.
for chunk in reader:
    # Shuffle the chunk
    chunk = chunk.sample(frac=1)

    # Pre-process the chunk
    chunk = model_preproc(chunk)

    # Fit the BernoulliNB classifier
    b_clf.partial_fit(chunk[:, 10:], chunk[:, 0], classes=np.array([0, 1]))

    # Fit the GaussianNB classifier
    g_clf.partial_fit(chunk[:, 1:10], chunk[:, 0], classes=np.array([0, 1]))

# Making predictions for the validation set
b_probs = b_clf.predict_proba(val[:, 10:])
g_probs = g_clf.predict_proba(val[:, 1:10])

# Combining the probabilities from the two NB classifiers:
# P(y | x_b, x_g) is proportional to P(y | x_b) * P(y | x_g) / P(y).
probs = (b_probs * g_probs) / g_clf.class_prior_

# The proportionality constant differs from row to row, so each row must be
# renormalised before scores are compared across samples.  All-zero rows
# (possible through underflow) are left at zero rather than becoming NaN.
row_totals = probs.sum(axis=1, keepdims=True)
probs = np.divide(probs, row_totals, out=np.zeros_like(probs), where=row_totals > 0)

# Calculate the ROC-AUC
val_auc = roc_auc_score(val[:, 0], probs[:, 1])
print('Validation set area under the ROC curve: %f' % val_auc)

# Recorded with the previous, unnormalised scores:
# Validation set area under the ROC curve: 0.683644
print(clf.predict_proba(Features)) import numpy as np X = np.random.randint(5, size=(6, 3)) y = np.array([1, 2, 3, 4, 5, 6]) from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB() clf.fit(X, y) MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) print(clf.predict_proba(X)) print(clf.predict_proba(np.array([3, 3, 2]))) XX = np.random.randint(5, size=(6, 3)) yy = np.array([1, 2, 3, 4, 5, 6]) clf.partial_fit(XX, yy) print(clf.predict_proba(X)) measurements = [ { 'city': 'Dubai', 'temperature': 33. }, { 'city': 'Dubai', 'temperature': 12. }, { 'city': 'San Fransisco', 'temperature': 18. },
sw = 1 + 4*chunk.is_booking chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True) XN = csr_matrix(chunk[num_col].values) X = csr_matrix((chunk.shape[0], n_features)) rows = np.arange(chunk.shape[0]) for col in cat_col_all: dat = np.ones(chunk.shape[0]) cols = chunk[col] % n_features X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features)) X = hstack((XN, X)) book_indices = sw[sw > 1].index.tolist() X_test = csr_matrix(X)[book_indices] y_test = y[book_indices] clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw) #len([i for i in clf.coef_[1] if i != 0]) #len([i for i in clf.coef_[1] if i > 0]) #jb = [col for h in np.argsort(abs(clf.coef_[5])) for col in chunk.columns if (hash(col) % n_features) == h] #preds += np.vstack(tuple([clf.predict_proba(test.loc[i*chunksize:min((i+1)*chunksize,test.shape[0]),:]) for i in range(int(test.shape[0]/100000))])) #preds += clf.predict_proba(test) count = count + chunksize map5 = map5eval(clf.predict_proba(X_test), y_test) print('%d rows completed. MAP@5: %f' % (count, map5)) if(count/chunksize == 200): break except Exception as e: #e = sys.exc_info()[0] print('Error: %s' % str(e))
print kobe_shots[kobe_shots['shot_type_2PT Field Goal']==1].count() score = [] shot_yn_2 = [] shot_yn_3 = [] bnb_2 = BernoulliNB() bnb_3 = BernoulliNB() for x, y, s in zip(kobe_x.iterrows(),kobe_y.iterrows(),kobe_shots.iterrows()): if s[1]['shot_type_2PT Field Goal'] == 1: if x[0]==0: print x, y, s op = [x[0]+1,0.5] score.append(op) else: if pd.notnull(y[1]['shot_made_flag']): bnb_2.partial_fit([x[1].tolist()],[y[1]['shot_made_flag']],classes=[0,1]) shot_yn_2.append(y[1]['shot_made_flag']) else: op = [x[0]+1,bnb_2.predict_proba(x[1])[0][1]] if x[0]%1000==0: print op score.append(op) else: if x[0]==0: print x, y, s op = [x[0]+1,0.5] score.append(op) else: if pd.notnull(y[1]['shot_made_flag']): bnb_3.partial_fit([x[1].tolist()],[y[1]['shot_made_flag']],classes=[0,1]) shot_yn_3.append(y[1]['shot_made_flag'])
# Split the loaded matrix: the last column is the label, the rest features.
m = len(X[0])
n = len(X)
X_train = X[:, 0:m-1]
y_train = X[:, m-1]
print "Done"
print

# Oversample with SMOTE before fitting.
sm = SMOTE(ratio=0.9)
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)
print

print "Fitting Model"
clf.partial_fit(X_train_sm, y_train_sm, classes=[0, 1])
print "Done"
print

# Persist the fitted classifier.
with open("/home/ubuntu/Weiyi/model_05_01", "w") as file_out:
    pickle.dump(clf, file_out)

# with open("/home/wlu/Desktop/model_bernoulli", "r") as file_in:
#     clf = pickle.load(file_in)
#
# print "Generating the training set"
# with open("/home/wlu/Desktop/test_sparse.npy", "r") as file_in:
#     X = load_sparse_csr(file_in).toarray()
#
# m = len(X[0])
# n = len(X)
def main(step=4):
    """Run one stage of the e-mail naive Bayes pipeline.

    step selects the stage (default 4, the value that used to be hard-coded):
        1: getMail() on every file in data/                 (preprocessing)
        2: build dictionary.txt from the files in preprocessed/
        3: getFeaturesMatrix(), getFeaturesMatrixTest(),
           getLabels(), getLabelsTest()                (feature vectors & labels)
        4: train and evaluate the naive Bayes models
        5: removeStopWords()
        6: stemDictionary()
        7: stemEmails()
    Any other value does nothing.
    """
    if step == 1:
        fileDir = "data/"
        for filename in os.listdir(fileDir):
            getMail(os.path.join(fileDir, filename))

    elif step == 2:
        fileDir = "preprocessed/"
        words = set()
        for filename in os.listdir(fileDir):
            with open(os.path.join(fileDir, filename), "r") as textFile:
                for line in textFile:
                    for token in line.split(" "):
                        # Strip the newline first: "word\n".isalpha() is
                        # always False, which used to empty the dictionary.
                        token = token.strip()
                        if token.isalpha():
                            words.add(token)
        # One alphabetic word per line, de-duplicated and sorted.
        with open("dictionary.txt", "w") as outFile:
            outFile.writelines(word + "\n" for word in sorted(words))

    elif step == 3:
        dictionary = []
        with open("dictionary.txt", "r") as dictionaryText:
            for line in dictionaryText:
                dictionary += line.split()
        # Return values are not used here.
        # NOTE(review): presumably these helpers persist their output
        # themselves — confirm.
        trainFeatures = getFeaturesMatrix(dictionary)
        testFeatures = getFeaturesMatrixTest(dictionary)
        trainLabels = getLabels()
        testLabels = getLabelsTest()

    elif step == 4:
        aClass = np.array([0, 1])
        # dtype=int replaces the np.int alias removed from current NumPy.
        datasetTrain = np.genfromtxt("dataset-training.csv", dtype=int, delimiter=",")
        datasetTest = np.genfromtxt("dataset-test.csv", dtype=int, delimiter=",")
        datasetTrainLabels = np.genfromtxt("dataset-training-labels.csv", dtype=int, delimiter=",")
        datasetTestLabels = np.genfromtxt("dataset-test-labels.csv", dtype=int, delimiter=",")

        modelM = MultinomialNB(alpha=1)
        modelB = BernoulliNB(alpha=1)

        # Train both models incrementally on the two halves of the data.
        splitTrain = datasetTrain.shape[0] // 2
        halves = (
            (datasetTrain[:splitTrain, :], datasetTrainLabels[0:splitTrain]),
            (datasetTrain[splitTrain:, :], datasetTrainLabels[splitTrain:]),
        )
        for model in (modelM, modelB):
            for halfFeatures, halfLabels in halves:
                model.partial_fit(halfFeatures, halfLabels, classes=aClass)

        modelAccuracyM = accuracy_score(datasetTestLabels, modelM.predict(datasetTest))
        modelAccuracyM1 = accuracy_score(datasetTrainLabels, modelM.predict(datasetTrain))
        modelAccuracyB = accuracy_score(datasetTestLabels, modelB.predict(datasetTest))
        modelAccuracyB1 = accuracy_score(datasetTrainLabels, modelB.predict(datasetTrain))

        print("Multinomial Naive Bayes (Test):\t", modelAccuracyM)
        print("Multinomial Naive Bayes (Train):\t", modelAccuracyM1)
        print("Bernoulli Naive Bayes (Test):\t", modelAccuracyB)
        print("Bernoulli Naive Bayes (Train):\t", modelAccuracyB1)

    elif step == 5:
        removeStopWords()
    elif step == 6:
        stemDictionary()
    elif step == 7:
        stemEmails()
# for i in range(counts.size): # ratios[i] = counts[i]/sum(counts) # print('Ratios of classes', ratios*100) # # m = max(counts) # # for i in range(counts.size): # # ratios[i] = m/counts[i] # ratios = 1 - ratios # print('Weights to penalize loss function', ratios) trweights = np.zeros(len(X_train)) for i in range(len(y_train)): trweights[i] = cl_weights[y_train[i]] teweights = np.zeros(len(X_test)) for i in range(len(y_test)): teweights[i] = cl_weights[y_test[i]] clf1.partial_fit(X_train, y_train, sample_weight=trweights, classes=[0, 1, 2, 3]) pred = clf1.predict(X_test) accuracy = clf1.score(X_test, y_test) print('Training time: {:.2f}s'.format(time() - t1)) print('Test accuracy = {:.2f}'.format(accuracy * 100)) acc[s - 1] = accuracy * 100 cfmat = confusion_matrix(y_test, pred) print(cfmat) predicted += cfmat.sum(axis=0) expected += cfmat.transpose().sum(axis=0) # porter = Porter(clf1) # op = porter.export(export_data=True) # with open('{}.java'.format(model), 'w') as f: # f.write(op) # os.rename('data.json', '{}.json'.format(model))
#kobe_id = pd.read_csv('data/kobe_x_id.csv')
kobe_x = pd.read_csv('data/kobe_x_transformed.csv')
kobe_y = pd.read_csv('data/kobe_y.csv')

score = []
shot_yn = []
bnb = BernoulliNB()
new_scores = []

# Walk the shots in file order: labelled shots update the model, unlabelled
# shots are scored with the model trained on everything seen so far.
for x, y in zip(kobe_x.iterrows(), kobe_y.iterrows()):
    if x[0] == 0:
        # Nothing has been learned yet for the first row: neutral guess.
        op = [x[0] + 1, 0.5]
        score.append(op)
    elif pd.notnull(y[1]['shot_made_flag']):
        bnb.partial_fit([x[1].tolist()], [y[1]['shot_made_flag']], classes=[0, 1])
        shot_yn.append(y[1]['shot_made_flag'])
    else:
        # predict_proba needs a 2-D input: one row, shaped exactly like the
        # rows given to partial_fit.  (The former ``x[0] < 0`` fallback could
        # never trigger on a 0-based index and was removed.)
        op = [x[0] + 1, bnb.predict_proba([x[1].tolist()])[0][1]]
        if x[0] % 1000 == 0:
            print(op)
        score.append(op)

#print score[0:5]
with open('data/attempt_2_output.csv', 'w') as f:
    f.write('shot_id,shot_made_flag' + '\n')
    f.writelines(str(s[0]) + ',' + str(s[1]) + '\n' for s in score)
l = ['month','TrainDay','testDay','recall','filtered'] wr.writerow(l) for diff in [1]: #1,7 # as for now, only [1] means test on next day for month in range(6,7): #5,7 # as for now, only range(6,7) means june for day in range(4,25): #1,32 # as for now, only range(4,5) means 1st day print '------------------------------------------------' print '------------------------------------------------' print 'month = ', month,' and day = ', day try: # Inputting training and testing set train_data, train_label = GetData(month, day) test_data, test_label = GetData(month, day+diff) print 'Data Read' #time.sleep(20) #sleep print 'Training Data...' clf.partial_fit(train_data, train_label, classes=[0, 1]) print 'Data Trained...' y_true = test_label n = len(y_true) ### Here's a problem... #print 'predictin jo ...' y_pred = clf.predict(test_data) #print 'getting conf matrix...' cf = confusion_matrix(y_true,y_pred) #print 'calculating recall...' recalll = recall_score(y_true, y_pred) #print 'calculating filtering' filtered = (cf[0,0])/float(n) print "Recall is: %s" % recalll print 'Filtering is = ', filtered print cf
# test subset contains 30% of the original dataset, train contains the rest
# Dataset is not shuffled before splitting
train, test, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=1, shuffle=False)

print("Train shape:", train.shape)
print("Train_labels shape:", train_labels.shape)
print("Test shape:", test.shape)
print("Test_labels shape:", test_labels.shape)

"""## Build classifier and evaluate performance"""

# Initialize our classifier
# Use Bernoulli distribution as only 2 outputs remain after stripping unlabeled data
# either malignant or benign
gnb = BernoulliNB()

# Train our classifier in roughly four equal batches.
# partial_fit needs the complete class list on its first call and rejects a
# different list later, so it is computed once from all training labels
# rather than from each (unshuffled, possibly single-class) subset.
all_classes = np.unique(train_labels)
# max(1, ...) keeps range() valid when there are fewer than four samples.
batch_len = max(1, len(train) // 4)
for i in range(0, len(train), batch_len):
    train_subset = train[i:i + batch_len]
    train_labels_subset = train_labels[i:i + batch_len]
    gnb.partial_fit(train_subset, train_labels_subset, all_classes)

# Make predictions
preds = gnb.predict(test)

# Evaluate accuracy
print("Accuracy :", accuracy_score(test_labels, preds))
y = data.target # Vectorize the movie reviews using our 8 words vect = CountVectorizer(vocabulary=["awful", "bad", "boring", "dull", "effective", "enjoyable", "great", "hilarious"]) X = vect.fit_transform(data.data) X = X.toarray() # Define our classifier and cross-validation clf = BernoulliNB(binarize=True) kf = KFold(n_splits=10, shuffle=True) kf.get_n_splits(X) # Perform cross-validation score = 0 for k, (train, test) in enumerate(kf.split(X, y)): clf.partial_fit(X[train], y[train], [0,1]) score += clf.score(X[test], y[test]) # Calculate average prediction accuracy score = score / 10 print("Bernoulli Average Score: {0:.5f}".format(score)) # Define our classifier and cross-validation clf = MultinomialNB() kf = KFold(n_splits=10, shuffle=True) kf.get_n_splits(X) # Perform cross-validation score = 0 for k, (train, test) in enumerate(kf.split(X, y)): clf.partial_fit(X[train], y[train], [0,1])
def test_alpha():
    """Check the handling of alpha = 0 and alpha < 0 in the discrete NB models.

    alpha=0 must not output NaN results when p(x_i|y_j)=0 occurs: a
    UserWarning is expected and the probabilities must stay finite.  A
    negative alpha must raise a ValueError on ``fit`` and ``partial_fit``.
    """
    msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10"
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])

    bernoulli_prob = np.array([[1, 0], [0, 1]])
    multinomial_prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])

    # (estimator class, data, expected probabilities, also test partial_fit?)
    zero_alpha_cases = [
        (BernoulliNB, X, bernoulli_prob, True),
        (MultinomialNB, X, multinomial_prob, True),
        (CategoricalNB, X, np.array([[1.0, 0.0], [0.0, 1.0]]), False),
    ]
    # Test sparse X
    X_sparse = scipy.sparse.csr_matrix(X)
    zero_alpha_cases += [
        (BernoulliNB, X_sparse, bernoulli_prob, False),
        (MultinomialNB, X_sparse, multinomial_prob, False),
    ]

    for estimator_cls, data, prob, with_partial_fit in zero_alpha_cases:
        nb = estimator_cls(alpha=0.0)
        if with_partial_fit:
            with pytest.warns(UserWarning, match=msg):
                nb.partial_fit(data, y, classes=[0, 1])
        with pytest.warns(UserWarning, match=msg):
            nb.fit(data, y)
        assert_array_almost_equal(nb.predict_proba(data), prob)

    # Test for alpha < 0
    expected_msg = re.escape(
        "Smoothing parameter alpha = -1.0e-01. alpha should be > 0."
    )
    for estimator_cls in (BernoulliNB, MultinomialNB, CategoricalNB):
        rejected = estimator_cls(alpha=-0.1)
        with pytest.raises(ValueError, match=expected_msg):
            rejected.fit(X, y)

    for estimator_cls in (BernoulliNB, MultinomialNB):
        rejected = estimator_cls(alpha=-0.1)
        with pytest.raises(ValueError, match=expected_msg):
            rejected.partial_fit(X, y, classes=[0, 1])
#clf = MultinomialNB()
#clf.fit(SP_train.iloc[:, 1:23], SP_train.iloc[:, 0])
#MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Evaluate the previously fitted ``clf`` on the test set
# (column 0 is the label, columns 1-22 are the features).
yTest_predNB = clf.predict(SP_test.iloc[:, 1:23])
print(yTest_predNB)

tn, fp, fn, tp = confusion_matrix(SP_test.iloc[:, 0], yTest_predNB).ravel()
AR = (tn+tp)/(tn+fp+fn+tp)
Sens = tp/(tp+fn)
Spec = tn/(tn+fp)
print("""The accuracy rate is %.3f, the sensitivity is %.3f,
      the specificity is %.3f""" %(AR, Sens, Spec))

#%%
clf1 = BernoulliNB()
clf1.partial_fit(SP_train.iloc[:, 1:23], SP_train.iloc[:, 0], np.array([0, 1]))

# Predict with clf1: the metrics below used to be computed from ``clf`` (the
# other model) while the printed predictions came from ``clf1``.
yTest_predNB1 = clf1.predict(SP_test.iloc[:, 1:23])
print(yTest_predNB1)

tn, fp, fn, tp = confusion_matrix(SP_test.iloc[:, 0], yTest_predNB1).ravel()
AR = (tn+tp)/(tn+fp+fn+tp)
Sens = tp/(tp+fn)
Spec = tn/(tn+fp)
print("""The accuracy rate is %.3f, the sensitivity is %.3f,
      the specificity is %.3f""" %(AR, Sens, Spec))