def predict(self, X_test):
    """Makes predictions for test instances in X_test.

    Args:
        X_test(list of list of obj): The list of testing samples
            The shape of X_test is (n_test_samples, n_features)

    Returns:
        y_predicted(list of obj): The predicted target y values (parallel to X_test)

    Predict: given X, find the C such that P(C|X) is greatest
        1. Calculate P(Ci|X) for every class label Ci
        2. Compare them and keep the label with the largest value
    """
    # compute P(X|C) by multiplying the matching posterior for each column value
    c_list = myutils.get_col_byindex(self.priors, 0)
    y_predicted = []
    # loop through the X_test rows
    for row in X_test:
        # for each test instance, compute P(C|X) for every possible class label
        all_p_cx = []  # holds one P(C|X) per class label, parallel to c_list
        # loop through each class label
        for c_list_index in range(len(c_list)):
            # p_cx accumulates the product of the posteriors P(V|C);
            # start at 1 so a zero posterior correctly zeroes the product
            p_cx = 1
            # loop through each value in the row
            for curr_val_index in range(len(row)):
                # find the posteriors entry for this column
                for posteriors in self.posteriors:
                    if curr_val_index != posteriors[0]:
                        continue
                    # posteriors = [col_index, [class_label, [val, P], ...], ...]
                    for i in range(1, len(posteriors)):
                        # find the sublist for the current class label C
                        if str(posteriors[i][0]) != str(c_list[c_list_index]):
                            continue
                        # find the [val, P] pair matching this attribute value
                        for j in range(1, len(posteriors[i])):
                            if str(posteriors[i][j][0]) == str(row[curr_val_index]):
                                p_cx = p_cx * posteriors[i][j][1]
                                break
                    break  # the matching column was found, stop searching
            # P(C|X) is proportional to P(X|C) * P(C)
            p_cx = p_cx * self.priors[c_list_index][1]
            all_p_cx.append(p_cx)
        # find the index of the largest P(C|X)
        best_p_index = all_p_cx.index(max(all_p_cx))
        # predict the class label with that index
        y_predicted.append(c_list[best_p_index])
    return y_predicted
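# Illustration (comments only, not executed): the nested structure predict() walks
# above, as built by fit() below. The labels, values, and ordering here are made up
# for a hypothetical two-column, two-class training set and only show the shapes
# involved; the actual ordering depends on the myutils helpers.
#
#   self.priors     = [["no", 0.5], ["yes", 0.5]]
#   self.posteriors = [
#       [0, ["no", ["sunny", 1.0], ["rainy", 0.0]], ["yes", ["sunny", 0.0], ["rainy", 1.0]]],
#       [1, ["no", ["hot", 0.5], ["mild", 0.5]],    ["yes", ["hot", 0.5], ["mild", 0.5]]],
#   ]
#
# i.e. self.posteriors[k] = [col_index, [class_label, [value, P(value|class)], ...], ...]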
def fit(self, X_train, y_train):
    """Fits a Naive Bayes classifier to X_train and y_train.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples

    Notes:
        Since Naive Bayes is an eager learning algorithm, this method computes the prior
            probabilities and the posterior probabilities for the training data.
        You are free to choose the most appropriate data structures for storing the
            priors and posteriors.

    Priors
        1. Probability of a class label C: P(C) = (# instances of C) / (total # instances)
        2. Probability of an instance/row X, P(X) (the same for every class, so it is
            ignored when comparing labels)
    Posteriors
        1. Probability of a row given a class label, using the independence assumption:
            P(X|C) = P(V1|C) * P(V2|C) * ...
            P(V|C) = ((# rows with V and C) / (total # rows)) / P(C)  **categorical only
        2. Probability of a class label given a row (used by predict()):
            P(C|X) is proportional to P(X|C) * P(C)
    """
    # priors
    # get each class label and how often it occurs
    self.priors = []
    self.posteriors = []
    if isinstance(y_train[0], int):
        c_list, counts = myutils.get_freq_1col(y_train)
    else:
        c_list, counts = myutils.get_freq_str(y_train)
    # create a [label, probability] pair for each class label and add it to priors
    for i in range(len(c_list)):
        p = counts[i] / len(y_train)
        self.priors.append([c_list[i], p])

    # posteriors
    # calculate P(V|C) for every possible value V of every column (excluding the class col)
    for i in range(len(X_train[0])):
        col = myutils.get_col_byindex(X_train, i)
        # NOTE: assumes the column is categorical; continuous values are not handled here
        # get a list of every possible value in this column
        val_list, counts = myutils.get_freq_str(col)
        # col_posteriors holds [col_index, [class_label, [val, P], ...], ...]
        col_posteriors = [i]
        # loop through each class label C
        for c_index in range(len(c_list)):
            # list of P(V|C) pairs for this class label
            posteriors = [c_list[c_index]]
            # loop through each value V
            for V in val_list:
                # count the rows where this column holds V and the label is C
                count = 0
                for j in range(len(X_train)):
                    if str(X_train[j][i]) == str(V) and str(y_train[j]) == str(c_list[c_index]):
                        count += 1
                # P(V|C) = ((# rows with V and C) / (total # rows)) / P(C)
                p = (count / len(y_train)) / self.priors[c_index][1]
                # append the [value, probability] pair for this class
                posteriors.append([V, p])
            col_posteriors.append(posteriors)
        # self.posteriors[k] = [col_index, [class_label, [val_name, P], ...], ...]
        self.posteriors.append(col_posteriors)
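# Example usage (a minimal sketch in comments, not part of the class): assumes this
# class is named MyNaiveBayesClassifier (hypothetical name -- substitute the real one)
# and that myutils provides get_freq_str()/get_col_byindex() as used above.
#
#   X_train = [["sunny", "hot"], ["sunny", "mild"], ["rainy", "mild"], ["rainy", "hot"]]
#   y_train = ["no", "no", "yes", "yes"]
#   nb = MyNaiveBayesClassifier()
#   nb.fit(X_train, y_train)                 # priors: P(no) = P(yes) = 0.5
#   print(nb.predict([["sunny", "hot"]]))    # expected: ["no"], since P(sunny|yes) = 0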