def train(self, x_supervised, x_unsupervised, y_supervised):
    """Train a semi-supervised Naive Bayes classifier with EM.

    The model is first fit on the labelled data, then refined by
    alternating an E-step (predict labels for the unlabelled data) and an
    M-step (refit on the unlabelled data with those predicted labels)
    until the joint log-likelihood converges or ``self.max_rounds`` EM
    iterations have been consumed.

    Positional arguments:
    x_supervised   -- [N_sup, in_features] labelled feature matrix
    x_unsupervised -- [N_unsup, in_features] unlabelled feature matrix
    y_supervised   -- [N_sup, out_class] labels for x_supervised
    """
    clf = BernoulliNB()
    clf.fit(x_supervised, y_supervised)
    old_likelihood = 1
    while self.max_rounds > 0:
        self.max_rounds -= 1
        # E-step: label the unlabelled data with the current model.
        predicted = clf.predict(x_unsupervised)
        # M-step: refit on the unlabelled data using the predicted labels.
        # NOTE(review): the labelled data is excluded from this refit, so
        # the model can drift from the supervised signal -- confirm intended.
        clf.fit(x_unsupervised, predicted)
        # Convergence check on the total joint log-likelihood.
        # (Two dead clf.predict(x_supervised) calls removed: their results
        # were never used.)
        unsupervised_log_matrix = clf._joint_log_likelihood(x_unsupervised)
        supervised_log_matrix = clf._joint_log_likelihood(x_supervised)
        total_likelihood = self.get_log_likelihood(
            unsupervised_log_matrix, supervised_log_matrix, y_supervised)
        if self._stopping_time(old_likelihood, total_likelihood):
            break
        old_likelihood = total_likelihood.copy()
    self.clf = clf
# Load the binarised label sets and, for the first two datasets, fit an
# un-smoothed Bernoulli Naive Bayes model, then accumulate a per-class
# "share" of the joint log-likelihood over the training objects.
with open('binarized_ys.pkl', 'rb') as fh:
    ys = pickle.load(fh)
print("Done.")
for i in range(0, 2):
    x_train, x_test, y_train, y_test = train_test_split(
        Xs[i], ys[i], test_size=1. / 3, random_state=3330)
    features = len(x_train)
    objects = len(y_train)
    # alpha=0 disables Laplace smoothing.
    clf = BernoulliNB(alpha=0, binarize=0.0, class_prior=None,
                      fit_prior=True)
    clf = clf.fit(x_train, y_train)
    a = clf._joint_log_likelihood(x_train)
    print("joint log likelyhood train")
    print(a)
    res = []
    # NOTE(review): these are ratios of *log*-likelihoods, not posterior
    # probabilities -- confirm this normalisation is what was intended.
    for i in range(0, objects):
        j = 0
        res.append([a[i][j] / (a[i][j] + a[i][j + 1]),
                    a[i][j + 1] / (a[i][j] + a[i][j + 1])])
    # WARNING: `sum` shadows the builtin; name kept because code beyond
    # this excerpt may reference it.
    sum = 0
    for i in range(0, objects):
        # BUG FIX: the original tested y_train[1] on every iteration of
        # this loop; each object's own label must select its class column.
        if y_train[i] == False:
            sum += res[i][0]
        else:
            sum += res[i][1]
#print("Alpha LIST : " ,alphaVal ) # For 10 datasets for e in range(0, 10): X_train, X_test, y_train, y_test = train_test_split( Xs[e], ys[e], test_size=1. / 3, random_state=6099) # A20396099 # For 15 alpha values for alp in range(0, 15): # BernoulliNB classifier clf = BernoulliNB(alpha=alphaVal[alp], binarize=0.0, fit_prior=True, class_prior=None) # fitting model on train data clf.fit(X_train, y_train) # prediction for train data using jll predict_train = clf._joint_log_likelihood(X_train) # prediction for test data using jll predict_test = clf._joint_log_likelihood(X_test) log_train, log_test = 0, 0 # print("Train : ", log_train) # print("Train : ", log_train) # print("Predict Train jll: ", predict_train) # print("Predict Test jll: ", predict_test) # summing test predections for test in range(len(predict_test)): if y_test[test] == True: log_test += predict_test[test][1] else: log_test += predict_test[test][0] # summing train predections for train in range(len(predict_train)):
# For each of 10 datasets and each alpha, fit BernoulliNB and store the
# summed joint log-likelihood of the true labels for train and test splits.
test_jll = np.zeros((10, 15))
for i in range(0, 10):
    idx = 0
    # Split datasets
    x_train, x_test, y_train, y_test = train_test_split(
        Xs[i], ys[i], test_size=1. / 3, random_state=7000)
    for j in alphas:
        # 1. Create new Bernoulli Naive Bayes model using alpha value
        mod = BernoulliNB(alpha=j)
        # Fit the model to the training set
        mod.fit(x_train, y_train)
        # Compute the joint log likelihood for the training set, store it
        # train_jll 2d array
        total_res = mod._joint_log_likelihood(x_train)
        # Booleans * 1 -> 0/1 column indices into the jll matrix.
        y_train_binary = y_train * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_train)):
            entry_val += total_res[k][y_train_binary[k]]
        # Store result
        train_jll[i][idx] = entry_val
        # 2. Compute the joint log likelihood for the testing set, store it
        # test_jll 2d array
        total_res = mod._joint_log_likelihood(x_test)
        y_test_binary = y_test * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_test)):
            entry_val += total_res[k][y_test_binary[k]]
        # NOTE(review): `idx` is never incremented within this excerpt, so
        # every alpha would overwrite column 0 -- presumably an `idx += 1`
        # follows beyond this chunk; verify against the full file.
        test_jll[i][idx] = entry_val
# Build the 15 alpha values 10**-7 .. 10**7, then for each dataset and
# alpha sum the joint log-likelihood of each sample's true class.
distribution = []
for i in range(-7, 8):
    distribution.append(10**i)
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        Xs[i], ys[i], test_size=1. / 3, random_state=4435)
    for j in range(15):
        classifier = BernoulliNB(alpha=distribution[j])
        classifier.fit(X_train, y_train)
        train_Y_score = classifier._joint_log_likelihood(X_train)
        individual_joint_likelihood = 0.0
        # Column 1 holds the jll for class True, column 0 for class False.
        for k in range(0, len(y_train)):
            if y_train[k] == True:
                individual_joint_likelihood += train_Y_score[k][1]
            else:
                individual_joint_likelihood += train_Y_score[k][0]
        train_joint_likelihood[i][j] = individual_joint_likelihood
        test_Y_score = classifier._joint_log_likelihood(X_test)
        individual_joint_likelihood = 0.0
        # NOTE(review): this excerpt is truncated mid-branch below -- the
        # `else:` body (presumably adding test_Y_score[k][0]) is not
        # visible in this chunk.
        for k in range(0, len(y_test)):
            if y_test[k] == True:
                individual_joint_likelihood += test_Y_score[k][1]
            else:
from sklearn.naive_bayes import BernoulliNB
# For each of 10 datasets and each alpha in 10**-7 .. 10**7, fit a
# BernoulliNB model and record the summed joint log-likelihood of the true
# labels on the train and test splits; persist both tables to result.pkl.
with open('binarized_xs.pkl', 'rb') as fx:
    Xs = pickle.load(fx)
with open('binarized_ys.pkl', 'rb') as fy:
    ys = pickle.load(fy)
train_jll = np.zeros((10, 15))
test_jll = np.zeros((10, 15))
for i_dataset in range(10):
    X, y = Xs[i_dataset], ys[i_dataset]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1. / 3, random_state=1527)
    # Boolean labels -> 0/1 column indices into the jll matrix.
    y_train_indices = [0 if i == False else 1 for i in y_train]
    y_test_indices = [0 if i == False else 1 for i in y_test]
    for i_alpha in range(-7, 8):
        clf = BernoulliNB(alpha=10**i_alpha)
        clf.fit(X_train, y_train)
        # PERF FIX: the original recomputed the full joint log-likelihood
        # matrix inside every iteration of the per-sample loops; compute
        # it once per fitted model instead (identical values).
        jll_train = clf._joint_log_likelihood(X_train)
        jll_test = clf._joint_log_likelihood(X_test)
        sum_train_jll, sum_test_jll = 0, 0
        for i in range(len(y_train)):
            sum_train_jll += jll_train[i][y_train_indices[i]]
        for i in range(len(y_test)):
            sum_test_jll += jll_test[i][y_test_indices[i]]
        train_jll[i_dataset][i_alpha + 7] = sum_train_jll
        test_jll[i_dataset][i_alpha + 7] = sum_test_jll
with open('result.pkl', 'wb') as fout:
    pickle.dump((train_jll, test_jll), fout)
# Fill train_jll/test_jll: one row per dataset, one column per alpha,
# each entry the summed joint log-likelihood of every sample's true class.
train_jll = np.zeros((10, 15))
test_jll = np.zeros((10, 15))
# Anumber A20406657
for ds, X_all in enumerate(Xs):
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, ys[ds], test_size=1. / 3, random_state=int("6657"))
    for col, alpha in enumerate(alpha_list):
        clf = BernoulliNB(alpha=alpha, binarize=0.0,
                          class_prior=None, fit_prior=True)
        clf.fit(X_train, y_train)
        jll_train = clf._joint_log_likelihood(X_train)
        jll_test = clf._joint_log_likelihood(X_test)
        # Pick column 1 of the jll row when the label is True, else column 0.
        train_total = 0
        for row, label in zip(jll_train, y_train):
            train_total += row[1] if label == True else row[0]
        test_total = 0
        for row, label in zip(jll_test, y_test):
            test_total += row[1] if label == True else row[0]
        train_jll[ds][col] = train_total
        test_jll[ds][col] = test_total