def set_test_dists(self, test_fnames):
    # Initialize empty histograms.
    # Since one histogram/pdf is computed for each element of the test set
    # (as below), they need to be re-initialized at every testing.
    for c in preferences.CLASSES:
        self.test_histograms[c], self.test_pdfs[c] = {}, {}
        for test_fname in test_fnames[c]:
            self.test_histograms[c][test_fname], self.test_pdfs[c][test_fname] = {}, {}
            for o in preferences.OBSERVABLES:
                self.test_histograms[c][test_fname][o] = data_tools.initialize_histogram(o)
                self.test_pdfs[c][test_fname][o] = []

    # Compute histograms for each class (using the test set).
    for c in preferences.CLASSES:
        for test_fname in test_fnames[c]:
            data = np.load(test_fname)
            data_A, data_B = data_tools.extract_individual_data(data)
            obs_data = data_tools.compute_observables(data_A, data_B)
            for o in preferences.OBSERVABLES:
                self.test_histograms[c][test_fname][o] = data_tools.compute_histogram_1D(o, obs_data[o])

    # Convert each histogram into a pdf.
    for c in preferences.CLASSES:
        for test_fname in test_fnames[c]:
            for o in preferences.OBSERVABLES:
                self.test_pdfs[c][test_fname][o] = data_tools.compute_pdf(o, self.test_histograms[c][test_fname][o])
def set_train_dists(self, train_fnames):
    # Initialize empty histograms.
    # Since the histogram is accumulated as below, it needs to be
    # re-initialized at every training.
    for c in preferences.CLASSES:
        self.train_histograms[c] = {}
        self.train_pdfs[c] = {}
        for o in preferences.OBSERVABLES:
            self.train_histograms[c][o] = data_tools.initialize_histogram(o)

    # Compute histograms for each class (using the training set).
    for c in preferences.CLASSES:
        for train_fname in train_fnames[c]:
            data = np.load(train_fname)
            data_A, data_B = data_tools.extract_individual_data(data)
            obs_data = data_tools.compute_observables(data_A, data_B)
            for o in preferences.OBSERVABLES:
                self.train_histograms[c][o] += data_tools.compute_histogram_1D(o, obs_data[o])

    # Convert each accumulated histogram into a pdf.
    for c in preferences.CLASSES:
        for o in preferences.OBSERVABLES:
            self.train_pdfs[c][o] = data_tools.compute_pdf(o, self.train_histograms[c][o])
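# A minimal usage sketch (illustrative only): set_train_dists and
# set_test_dists expect a dict keyed by class label, mapping each class to a
# list of .npy trajectory files. The labels, file paths and the `classifier`
# argument below are hypothetical placeholders, not actual project data.
def _example_set_dists(classifier):
    train_fnames = {
        'koibito': ['../data/dyads/koibito/train_dyad_01.npy'],
        'doryo': ['../data/dyads/doryo/train_dyad_01.npy'],
    }
    test_fnames = {
        'koibito': ['../data/dyads/koibito/test_dyad_01.npy'],
        'doryo': ['../data/dyads/doryo/test_dyad_01.npy'],
    }
    classifier.set_train_dists(train_fnames)  # one accumulated histogram/pdf per class
    classifier.set_test_dists(test_fnames)    # one histogram/pdf per class per test file
    return classifier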
def train(self, train_fnames):
    train_histograms1D = {}

    # Initialize empty histograms.
    for o in preferences.OBSERVABLES:
        train_histograms1D[o], self.train_pdfs1D[o] = {}, {}
        for c in preferences.CLASSES:
            train_histograms1D[o][c] = data_tools.initialize_histogram(o)

    # Compute histograms for each class.
    for c in preferences.CLASSES:
        for file_path in train_fnames[c]:
            data = np.load(file_path)
            data_A, data_B = data_tools.extract_individual_data(data)
            obs_data = data_tools.compute_observables(data_A, data_B)
            for o in preferences.OBSERVABLES:
                train_histograms1D[o][c] += data_tools.compute_histogram_1D(o, obs_data[o])

    # Convert the accumulated histograms into pdfs.
    for o in preferences.OBSERVABLES:
        for c in preferences.CLASSES:
            self.train_pdfs1D[o][c] = data_tools.compute_pdf(o, train_histograms1D[o][c])
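# Illustrative sketch (not the actual data_tools API): the accumulation
# pattern used above, i.e. summing per-file 1D histograms over a fixed
# binning and then normalizing the total so it sums to 1, which is
# conceptually what compute_histogram_1D followed by compute_pdf provides.
# All names and numbers below are made up for the example.
def _sketch_accumulate_and_normalize():
    import numpy as np
    bin_edges = np.linspace(0.0, 2.0, 21)        # fixed bins for one observable
    accumulated = np.zeros(len(bin_edges) - 1)   # plays the role of train_histograms1D[o][c]
    per_file_samples = [np.random.rand(100), np.random.rand(250)]  # stand-ins for per-file data
    for samples in per_file_samples:
        counts, _ = np.histogram(samples, bins=bin_edges)
        accumulated += counts                    # histogram accumulated over all training files
    pdf = accumulated / accumulated.sum()        # bin probabilities now sum to 1
    return pdf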
if __name__ == "__main__":
    start_time = time.time()

    data_fnames = file_tools.get_data_fnames('../data/gender_compositions/')

    histograms1D = {}
    pdfs1D = {}

    # Initialize empty histograms.
    for o in preferences.OBSERVABLES:
        histograms1D[o], pdfs1D[o] = {}, {}
        for c in preferences.CLASSES_RAW:
            histograms1D[o][c] = data_tools.initialize_histogram(o)

    # Compute histograms for each class.
    for c in preferences.CLASSES_RAW:
        for file_path in data_fnames[c]:
            data = np.load(file_path)
            data_A, data_B = data_tools.extract_individual_data(data)
            obs_data = data_tools.compute_observables(data_A, data_B)
            for o in preferences.OBSERVABLES:
                histograms1D[o][c] += data_tools.compute_histogram_1D(o, obs_data[o])

    # Convert the histograms into pdfs and plot them.
    for o in preferences.OBSERVABLES:
        for c in preferences.CLASSES_RAW:
            pdfs1D[o][c] = data_tools.compute_pdf(o, histograms1D[o][c])
    plot_pdf(pdfs1D)

    elapsed_time = time.time() - start_time
    print('\nTime elapsed %2.2f sec' % elapsed_time)
def estimate(self, alpha, test_fnames):
    """
    Performance is evaluated in various ways. Below I explain each of
    these on a toy example.

    Let's assume we have a dyad which is annotated as Doryo (D) and
    whose trajectory has length 8. Suppose the posterior probabilities
    are found as follows:

                     [K    D    Y    Kz  ]
        time = 0     [0.45 0.20 0.10 0.25]
        time = 1     [0.20 0.10 0.45 0.25]
        time = 2     [0.45 0.20 0.10 0.25]
        time = 3     [0.20 0.45 0.10 0.25]
        time = 4     [0.45 0.20 0.10 0.25]
        time = 5     [0.25 0.20 0.10 0.45]
        time = 6     [0.45 0.20 0.10 0.25]
        time = 7     [0.20 0.45 0.10 0.25]

    Here, each vector contains the (posterior) probabilities for
    koibito (K), doryo (D), yujin (Y), and kazoku (Kz), respectively.

    -----------------------------------------------------------------------
    trajectory-based decision:
    yields a single estimation of social relation for each dyad.
    TODO

    -----------------------------------------------------------------------
    binary_by_event

    For each trajectory data point (event), we make an instantaneous
    decision by choosing the social relation class with the highest
    posterior probability. The instantaneous decisions are then:

        [K Y K D K Kz K D]

    We build a confusion matrix such that the rows represent the true
    class and the columns represent the assigned class. The above dyad
    contributes to the confusion matrix (before scaling) as follows:

                                    [K D Y Kz]
                                    | 0 0 0 0 |
        old_mats_before_scaling  +  | 4 2 1 1 |
                                    | 0 0 0 0 |
                                    | 0 0 0 0 |

    After processing all dyads, each row is scaled to 1.

    -----------------------------------------------------------------------
    binary_by_trajectory_voting

    According to this assessment method, for each dyad we pick the
    social relation class with the highest number of votes among all
    trajectory data points. So eventually the dyad gets a single
    estimation (discrete output).

    For the above case, the votes are as follows:

        K  = 4
        D  = 2
        Y  = 1
        Kz = 1

    So the estimated class will be K. If this decision is correct, it
    gives a 1, otherwise a 0. In the confusion matrix I also store the
    exact mistakes (off-diagonal). The above dyad contributes to the
    confusion matrix (before scaling) as follows:

                                    [K D Y Kz]
                                    | 0 0 0 0 |
        old_mats_before_scaling  +  | 1 0 0 0 |
                                    | 0 0 0 0 |
                                    | 0 0 0 0 |

    After processing all dyads, each row is scaled to 1.

    -----------------------------------------------------------------------
    binary_by_trajectory_probability

    For every dyad, we take the average probability concerning each
    social relation class over all trajectory data points. The class
    with the highest average probability is the estimated class.

    For the above case, the average probabilities are:

        K  = 0.33125
        D  = 0.25
        Y  = 0.14375
        Kz = 0.275

    So the decision will be K. The above dyad contributes to the
    confusion matrix (before scaling) as follows:

                                    [K D Y Kz]
                                    | 0 0 0 0 |
        old_mats_before_scaling  +  | 1 0 0 0 |
                                    | 0 0 0 0 |
                                    | 0 0 0 0 |

    After processing all dyads, each row is scaled to 1.

    -----------------------------------------------------------------------
    probabilistic_by_event

    This assessment method yields two outcomes:
        1. Confusion matrix
        2. Confidence values

    1. Confusion matrix:
    I accumulate the posterior probabilities derived from each
    trajectory data point of each dyad in a confusion matrix. The above
    dyad contributes to the confusion matrix (before scaling) as
    follows:

                                    | 0    0    0    0   |   | 0    0    0    0   |
        old_mats_before_scaling  +  | 0.45 0.20 0.10 0.25| + | 0.20 0.10 0.45 0.25| +
                                    | 0    0    0    0   |   | 0    0    0    0   |
                                    | 0    0    0    0   |   | 0    0    0    0   |

                                    | 0    0    0    0   |
                                    | 0.45 0.20 0.10 0.25|  +  ...  +
                                    | 0    0    0    0   |
                                    | 0    0    0    0   |
                                    | 0    0    0    0   |
                                    | 0.20 0.45 0.10 0.25|
                                    | 0    0    0    0   |
                                    | 0    0    0    0   |

    After processing all dyads, each row is scaled to 1.

    2. Confidence values:
    The confidence is defined as:

        conf = 1 - (p_max - p_gt)

    Here p_max is the highest probability (among the probabilities
    associated with each possible outcome, i.e. class), whereas p_gt is
    the probability associated with the gt class.

    I compute a confidence value at each single observation point
    (i.e. trajectory point). I do not store all these values. Instead,
    I store only the variables necessary to compute statistics, namely:
        the number of observations
        the sum of the confidence values
        the sum of the squares of the confidence values

    For the above example, the confidence values are:

        time = 0:   1 - (0.45 - 0.20) = 0.75
        time = 1:   1 - (0.45 - 0.10) = 0.65
        time = 2:   1 - (0.45 - 0.20) = 0.75
        time = 3:   1 - (0.45 - 0.45) = 1
        time = 4:   1 - (0.45 - 0.20) = 0.75
        time = 5:   1 - (0.45 - 0.20) = 0.75
        time = 6:   1 - (0.45 - 0.20) = 0.75
        time = 7:   1 - (0.45 - 0.45) = 1

    and I update the stored values as follows:

        number_of_observations += 8
        sum_confidence_values += 0.75 + 0.65 + 0.75 + 1 + 0.75 + 0.75 + 0.75 + 1
        sum_of_squares_of_confidence_values += 0.75**2 + 0.65**2 + 0.75**2 + 1**2
                                               + 0.75**2 + 0.75**2 + 0.75**2 + 1**2

    -----------------------------------------------------------------------
    probabilistic_by_trajectory

    This assessment method yields two outcomes:
        1. Confusion matrix
        2. Confidence values

    1. Confusion matrix:
    Remember from binary_by_trajectory_probability that we computed the
    average probability concerning each social relation class over all
    trajectory data points. For the above case, the average
    probabilities were:

        K  = 0.33125
        D  = 0.25
        Y  = 0.14375
        Kz = 0.275

    The above dyad contributes to the confusion matrix (before scaling)
    as follows:

                                    | 0       0    0       0    |
        old_mats_before_scaling  +  | 0.33125 0.25 0.14375 0.275|
                                    | 0       0    0       0    |
                                    | 0       0    0       0    |

    After processing all dyads, each row is scaled to 1.

    2. Confidence values:
    I compute the confidence on the above probability vector:

        confidence = 1 - (0.33125 - 0.25) = 0.91875

    and I update the stored values as follows:

        number_of_observations += 1
        sum_confidence_values += 0.91875
        sum_of_squares_of_confidence_values += 0.91875**2

    -----------------------------------------------------------------------
    empirical_probability_by_trajectory

    This assessment method yields two outcomes:
        1. Confusion matrix
        2. Confidence values

    1. Confusion matrix:
    Here, I first derive a probability vector from the votes given at
    each data point. For the above case, the votes were as follows:

        K  = 4
        D  = 2
        Y  = 1
        Kz = 1

    So I build a probability vector as:

        [K   D   Y   Kz ]
        [4/8 2/8 1/8 1/8]

    Then this vector is added to the confusion matrix as below:

                                    | 0    0    0     0    |
        old_mats_before_scaling  +  | 0.50 0.25 0.125 0.125|
                                    | 0    0    0     0    |
                                    | 0    0    0     0    |

    After processing all dyads, each row is scaled to 1.

    2. Confidence values:
    The confidence is defined as:

        conf = 1 - (p_max - p_gt)

    This time p_max is the highest value in the above vector. As in the
    previous cases, p_gt is the probability associated with the gt
    class. For the above case, since the gt class is D, conf is:

        conf = 1 - (0.50 - 0.25) = 0.75

    and I update the stored values as follows:

        number_of_observations += 1
        sum_confidence_values += 0.75
        sum_of_squares_of_confidence_values += 0.75**2

    As before, values close to 1 indicate that, even if there is a
    mistake, it is not a big one.
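    -----------------------------------------------------------------------
    Note on the stored confidence accumulators: from the three values
    kept per class (n_observations, cum_confidence, cum_confidence_sq),
    the summary statistics can later be recovered, e.g. as

        mean_conf = cum_confidence / n_observations
        std_conf  = sqrt(cum_confidence_sq / n_observations - mean_conf**2)

    For instance, for the eight per-event confidences listed under
    probabilistic_by_event (0.75, 0.65, 0.75, 1, 0.75, 0.75, 0.75, 1),
    mean_conf = 6.40 / 8 = 0.80.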
    -----------------------------------------------------------------------
    """
    for class_gt in preferences.CLASSES:
        for test_fname in test_fnames[class_gt]:
            # Fundamental gt, used in case of hierarchical stage 1
            # (while the others contain 3 classes).
            class_gt_fund = test_fname.split('/')[3]  # hardcoded

            data = np.load(test_fname)
            data_A, data_B = data_tools.extract_individual_data(data)
            N_observations = len(data_A)  # len(data_B) is the same
            obs_data = data_tools.compute_observables(data_A, data_B)

            bins = {}
            for o in preferences.OBSERVABLES:
                bins[o] = data_tools.find_bins(o, obs_data[o])

            p_posts = self.compute_probabilities(bins, alpha)

            ###############################################################
            #
            # binary_by_event
            #
            for i in range(0, N_observations):
                # get all instantaneous probabilities
                p_inst = {}  # instantaneous probabilities
                for class_temp in preferences.CLASSES:
                    p_inst[class_temp] = p_posts[class_temp][i]

                # class_est is the estimated class for this data point
                class_est = max(p_inst.items(), key=operator.itemgetter(1))[0]

                self.conf_mat['binary_by_event'][class_gt][class_est] += 1
                if preferences.HIERARCHICAL == 'stage1':
                    self.conf_mat['binary_by_event_with_gt_fund'][class_gt_fund][class_est] += 1

            # IMPORTANT:
            # This matrix needs to be scaled such that each row adds up to 1.
            # This will be done when I write the matrix to the txt data file.

            ###############################################################
            #
            # binary_by_trajectory_voting
            #
            n_votes = {}
            for class_temp in preferences.CLASSES:
                n_votes[class_temp] = 0

            for i in range(0, N_observations):
                # get all instantaneous probabilities
                p_inst = {}  # instantaneous probabilities
                for class_temp in preferences.CLASSES:
                    p_inst[class_temp] = p_posts[class_temp][i]

                # One vote is given at every data point; the vote goes to
                # the class with the highest posterior probability.
                # class_est is the estimated class for this data point.
                class_est = max(p_inst.items(), key=operator.itemgetter(1))[0]
                n_votes[class_est] += 1

            # The estimated class is the one which receives the highest
            # number of votes (along the trajectory).
            class_est_voting_winner = max(n_votes.items(), key=operator.itemgetter(1))[0]

            self.conf_mat['binary_by_trajectory_voting'][class_gt][class_est_voting_winner] += 1
            if preferences.HIERARCHICAL == 'stage1':
                self.conf_mat['binary_by_trajectory_voting_with_gt_fund'][class_gt_fund][class_est_voting_winner] += 1

            # IMPORTANT:
            # This matrix needs to be scaled such that each row adds up to 1.
            # This will be done when I write the matrix to the txt data file.

            ###############################################################
            #
            # binary_by_trajectory_probability
            #
            p_mean = {}
            for class_est in preferences.CLASSES:
                # here class_est is not really the 'output decision'
                p_mean[class_est] = np.mean(p_posts[class_est])

            p_max = max(p_mean.items(), key=operator.itemgetter(1))[1]
            c_out = max(p_mean.items(), key=operator.itemgetter(1))[0]

            self.conf_mat['binary_by_trajectory_probability'][class_gt][c_out] += 1
            if preferences.HIERARCHICAL == 'stage1':
                self.conf_mat['binary_by_trajectory_probability_with_gt_fund'][class_gt_fund][c_out] += 1

            # IMPORTANT:
            # This matrix needs to be scaled such that each row adds up to 1.
            # This will be done when I write the matrix to the txt data file.
            ###############################################################
            #
            # probabilistic_by_event
            #
            for i in range(0, N_observations):
                # get all instantaneous probabilities
                p_inst = {}  # instantaneous probabilities
                for class_temp in preferences.CLASSES:
                    p_inst[class_temp] = p_posts[class_temp][i]
                    self.conf_mat['probabilistic_by_event'][class_gt][class_temp] += p_inst[class_temp]
                    if preferences.HIERARCHICAL == 'stage1':
                        self.conf_mat['probabilistic_by_event_with_gt_fund'][class_gt_fund][class_temp] += p_inst[class_temp]

                # Confidence is computed per data point from the
                # instantaneous probabilities (see the docstring).
                p_max = max(p_inst.items(), key=operator.itemgetter(1))[1]
                p_gt = p_inst[class_gt]
                confidence = 1 - (p_max - p_gt)

                self.confidence['probabilistic_by_event'][class_gt]['n_observations'] += 1
                self.confidence['probabilistic_by_event'][class_gt]['cum_confidence'] += confidence
                self.confidence['probabilistic_by_event'][class_gt]['cum_confidence_sq'] += confidence**2

            # IMPORTANT:
            # Conf_mat needs to be scaled such that each row adds up to 1.
            # This will be done by the scale_conf_mats function.
            # In addition, I will derive the statistics regarding confidence,
            # ie mean and std, from the three stored values.

            ###############################################################
            #
            # probabilistic_by_trajectory
            #
            p_mean = {}
            for class_est in preferences.CLASSES:
                # here class_est is not really the 'output decision'
                p_mean[class_est] = np.mean(p_posts[class_est])
                self.conf_mat['probabilistic_by_trajectory'][class_gt][class_est] += p_mean[class_est]
                if preferences.HIERARCHICAL == 'stage1':
                    self.conf_mat['probabilistic_by_trajectory_with_gt_fund'][class_gt_fund][class_est] += p_mean[class_est]

            p_max = max(p_mean.items(), key=operator.itemgetter(1))[1]
            p_gt = p_mean[class_gt]
            confidence = 1 - (p_max - p_gt)

            self.confidence['probabilistic_by_trajectory'][class_gt]['n_observations'] += 1
            self.confidence['probabilistic_by_trajectory'][class_gt]['cum_confidence'] += confidence
            self.confidence['probabilistic_by_trajectory'][class_gt]['cum_confidence_sq'] += confidence**2

            # IMPORTANT:
            # Conf_mat needs to be scaled such that each row adds up to 1.
            # This will be done by the scale_conf_mats function.
            # In addition, I will derive the statistics regarding confidence,
            # ie mean and std, from the three stored values.
            ###############################################################
            #
            # empirical_probability_by_trajectory
            #
            n_votes = {}
            for class_temp in preferences.CLASSES:
                n_votes[class_temp] = 0

            for i in range(0, N_observations):
                # get all instantaneous probabilities
                p_inst = {}  # instantaneous probabilities
                for class_temp in preferences.CLASSES:
                    p_inst[class_temp] = p_posts[class_temp][i]

                # One vote is given at every data point; the vote goes to
                # the class with the highest posterior probability.
                # class_est is the estimated class for this data point.
                class_est = max(p_inst.items(), key=operator.itemgetter(1))[0]
                n_votes[class_est] += 1

            # Scale the votes so that they sum to 1 and thus represent
            # (empirical) probabilities.
            factor = 1.0 / sum(n_votes.values())
            class_est_emp_probs = {k: v * factor for k, v in n_votes.items()}

            for class_est in preferences.CLASSES:
                # here class_est is not really the 'output decision';
                # I only keep the probability associated with every
                # possible outcome
                self.conf_mat['empirical_probability_by_trajectory'][class_gt][class_est] += \
                    class_est_emp_probs[class_est]
                if preferences.HIERARCHICAL == 'stage1':
                    self.conf_mat['empirical_probability_by_trajectory_with_gt_fund'][class_gt_fund][class_est] += \
                        class_est_emp_probs[class_est]

            p_max = max(class_est_emp_probs.items(), key=operator.itemgetter(1))[1]
            p_gt = class_est_emp_probs[class_gt]
            confidence = 1 - (p_max - p_gt)

            self.confidence['empirical_probability_by_trajectory'][class_gt]['n_observations'] += 1
            self.confidence['empirical_probability_by_trajectory'][class_gt]['cum_confidence'] += confidence
            self.confidence['empirical_probability_by_trajectory'][class_gt]['cum_confidence_sq'] += confidence**2
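# Illustrative sketch only: the "each row adds up to 1" scaling that the
# comments above defer to a later step (e.g. the scale_conf_mats function,
# which is not shown here). It assumes a confusion matrix stored as a nested
# dict {gt_class: {est_class: accumulated value}}; the names are placeholders.
def _scale_rows_to_one(conf_mat):
    scaled = {}
    for gt_class, row in conf_mat.items():
        row_sum = float(sum(row.values()))
        if row_sum > 0:
            scaled[gt_class] = {est: val / row_sum for est, val in row.items()}
        else:
            scaled[gt_class] = dict(row)  # leave all-zero rows as they are
    return scaled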