def set_test_dists(self, test_fnames):
    """Compute a separate histogram and pdf for every test file.

    Results are stored in ``self.test_histograms`` and ``self.test_pdfs``,
    both indexed as ``[class][test_fname][observable]``.

    Parameters
    ----------
    test_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.
    """
    classes = preferences.CLASSES
    observables = preferences.OBSERVABLES

    # One histogram/pdf is kept per element of the test set, so the
    # containers have to be rebuilt from scratch at every testing round.
    for cls in classes:
        self.test_histograms[cls] = {}
        self.test_pdfs[cls] = {}
        for fname in test_fnames[cls]:
            self.test_histograms[cls][fname] = {
                obs: generic_tools.initialize_histogram(obs)
                for obs in observables
            }
            self.test_pdfs[cls][fname] = {obs: [] for obs in observables}

    # Histogram of every observable, file by file.
    for cls in classes:
        for fname in test_fnames[cls]:
            raw = np.load(fname)
            data_A, data_B = generic_tools.extract_individual_data(raw)
            obs_data = generic_tools.compute_observables(data_A, data_B)
            file_hists = self.test_histograms[cls][fname]
            for obs in observables:
                file_hists[obs] = generic_tools.compute_histogram_1D(
                    obs, obs_data[obs])

    # Turn every histogram into a pdf.
    for cls in classes:
        for fname in test_fnames[cls]:
            file_hists = self.test_histograms[cls][fname]
            file_pdfs = self.test_pdfs[cls][fname]
            for obs in observables:
                file_pdfs[obs] = generic_tools.compute_pdf(
                    obs, file_hists[obs])
def set_train_dists(self, train_fnames):
    """Accumulate per-class training histograms and derive their pdfs.

    Results are stored in ``self.train_histograms`` and ``self.train_pdfs``,
    both indexed as ``[class][observable]``.

    Parameters
    ----------
    train_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.
    """
    classes = preferences.CLASSES
    observables = preferences.OBSERVABLES

    # The histograms are accumulated over files below, so they must start
    # from a fresh state at every training round.
    for cls in classes:
        self.train_histograms[cls] = {
            obs: generic_tools.initialize_histogram(obs)
            for obs in observables
        }
        self.train_pdfs[cls] = {}

    # Add the contribution of every training file to its class histogram.
    for cls in classes:
        class_hists = self.train_histograms[cls]
        for fname in train_fnames[cls]:
            raw = np.load(fname)
            data_A, data_B = generic_tools.extract_individual_data(raw)
            obs_data = generic_tools.compute_observables(data_A, data_B)
            for obs in observables:
                class_hists[obs] += generic_tools.compute_histogram_1D(
                    obs, obs_data[obs])

    # Convert the accumulated histograms into pdfs.
    for cls in classes:
        class_hists = self.train_histograms[cls]
        class_pdfs = self.train_pdfs[cls]
        for obs in observables:
            class_pdfs[obs] = generic_tools.compute_pdf(obs, class_hists[obs])
def train(self, train_fnames):
    """Build the 1D training pdfs, stored as ``self.train_pdfs1D[obs][class]``.

    Parameters
    ----------
    train_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.
    """
    classes = preferences.CLASSES
    observables = preferences.OBSERVABLES

    # Local accumulators: one empty histogram per (observable, class).
    accumulated = {}
    for obs in observables:
        self.train_pdfs1D[obs] = {}
        accumulated[obs] = {
            cls: generic_tools.initialize_histogram(obs) for cls in classes
        }

    # Add every training file to the histograms of its class.
    for cls in classes:
        for path in train_fnames[cls]:
            raw = np.load(path)
            data_A, data_B = generic_tools.extract_individual_data(raw)
            obs_data = generic_tools.compute_observables(data_A, data_B)
            for obs in observables:
                accumulated[obs][cls] += generic_tools.compute_histogram_1D(
                    obs, obs_data[obs])

    # Convert the accumulated histograms into pdfs.
    for obs in observables:
        pdfs_for_obs = self.train_pdfs1D[obs]
        for cls in classes:
            pdfs_for_obs[cls] = generic_tools.compute_pdf(
                obs, accumulated[obs][cls])
def trainKDE(self, train_fnames):
    """Fit one Gaussian kernel density estimator per class.

    For every class the observables of all training files are pooled into
    one sample matrix (one row per trajectory point, one column per entry
    of ``preferences.OBSERVABLES``). The bandwidth is chosen by a
    cross-validated grid search, appended to a time-stamped log file under
    ``results/bw_est_stability/``, and the final estimator is stored in
    ``self.kernels[class]``.

    Parameters
    ----------
    train_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.
    """
    # Function-scope import so the block does not depend on the file header.
    import os

    stamp = time.strftime('%Y_%m_%d_%H_%M')
    out_fname_bw_ests = ('results/bw_est_stability/' + stamp +
                         '_KDE_BW_estimation_stability.txt')
    # Create the log directory up front: otherwise a missing directory is
    # only discovered after the (expensive) grid search has finished.
    os.makedirs(os.path.dirname(out_fname_bw_ests), exist_ok=True)

    # The candidate bandwidths do not depend on the class.
    params = {
        'bandwidth': np.linspace(preferences.BW0, preferences.BWF,
                                 preferences.NBINS_BW)
    }

    for c in preferences.CLASSES:
        values = []
        for file_path in train_fnames[c]:
            data = np.load(file_path)
            data_A, data_B = generic_tools.extract_individual_data(data)
            obs_data = generic_tools.compute_observables(data_A, data_B)

            # len(data_A) and len(data_B) are the same; every trajectory
            # point becomes one sample holding all observables.
            for j in range(len(data_A)):
                values.append(
                    [obs_data[o][j] for o in preferences.OBSERVABLES])

        # Convert once and reuse for both the search and the final fit.
        samples = np.array(values)

        # Optimize the kernel bandwidth with sklearn grid search.
        # NOTE: with scikit-learn 0.21.dev0 this fit emitted a
        # DeprecationWarning about the default of the `iid` parameter
        # (changing in 0.22, removed in 0.24), which can change numeric
        # results when test-set sizes are unequal.
        grid = GridSearchCV(KernelDensity(kernel='gaussian'), params,
                            cv=preferences.NCV_BW)
        grid.fit(samples)
        bw = grid.best_estimator_.bandwidth

        with open(out_fname_bw_ests, "a") as myfile:
            myfile.write('{}\t{}\n'.format(c, bw))

        self.kernels[c] = KernelDensity(bandwidth=bw, kernel='gaussian',
                                        algorithm='ball_tree')
        self.kernels[c].fit(samples)
def train(self, train_fnames):
    """Build the N-dimensional training pdf of every class.

    The ND histograms of all training files of a class are summed and the
    total is converted into ``self.train_pdfs_ND[class]``.

    Parameters
    ----------
    train_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.
    """
    # Start every class from an empty ND histogram.
    accumulated = {
        cls: generic_tools.initialize_histogram_ND()
        for cls in preferences.CLASSES
    }

    for cls in preferences.CLASSES:
        for path in train_fnames[cls]:
            raw = np.load(path)
            data_A, data_B = generic_tools.extract_individual_data(raw)
            obs_data = generic_tools.compute_observables(data_A, data_B)
            # The second returned item (the edges) is not needed here.
            counts, _ = generic_tools.compute_histogram_ND(obs_data)
            accumulated[cls] += counts

        # All files of this class are in: derive its pdf.
        self.train_pdfs_ND[cls] = generic_tools.compute_pdf_ND(
            accumulated[cls])
def trainMA(self, train_fnames, sizeMA):
    """Build the ND training pdfs and smooth them with a moving average.

    The ND histograms of all training files of a class are summed, turned
    into a pdf, and filtered with ``ndimage.uniform_filter``. The result
    is stored in ``self.train_pdfs_ND[class]``.

    Parameters
    ----------
    train_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.
    sizeMA : int, sequence of int, or None
        Size of the moving-average window, forwarded as the ``size``
        argument of ``ndimage.uniform_filter``. ``None`` falls back to
        ``preferences.SIZE_MA``.
    """
    # Previously this argument was ignored and preferences.SIZE_MA was
    # always used; honour the caller's value and keep the preference only
    # as the fallback.
    filter_size = preferences.SIZE_MA if sizeMA is None else sizeMA

    train_histograms_ND = {
        c: generic_tools.initialize_histogram_ND()
        for c in preferences.CLASSES
    }

    # Compute histograms for each class.
    for c in preferences.CLASSES:
        for file_path in train_fnames[c]:
            data = np.load(file_path)
            data_A, data_B = generic_tools.extract_individual_data(data)
            obs_data = generic_tools.compute_observables(data_A, data_B)
            # The second returned item (the edges) is not needed here.
            hist, _ = generic_tools.compute_histogram_ND(obs_data)
            train_histograms_ND[c] += hist

        pdf = generic_tools.compute_pdf_ND(train_histograms_ND[c])
        self.train_pdfs_ND[c] = ndimage.uniform_filter(pdf, size=filter_size)
def estimate(self, alpha, test_fnames):
    """
    Classify every test trajectory and accumulate the results in
    self.conf_mat.

    Parameters
    ----------
    alpha :
        Forwarded unchanged to self.compute_probabilities.
    test_fnames : mapping
        Maps every class in preferences.CLASSES to an iterable of file
        names that np.load can read.

    Trajectories without any observation are skipped with a warning.

    Performance is evaluated in various ways.

    -----------------------------------------------------------------------
    event-based:
        treats each point on trajectory as an event. For each event, we
        make an instantaneous decision (koibito, yujin, etc).

        For instance, we have post probabilities as follows:

        time = t    [K    D    Y    Kz  ]
        time = 0    [0.45 0.20 0.10 0.25]
        time = 1    [0.20 0.10 0.45 0.25]
        time = 2    [0.45 0.20 0.10 0.25]
        time = 3    [0.20 0.45 0.10 0.25]
        time = 4    [0.45 0.20 0.10 0.25]
        time = 5    [0.25 0.20 0.10 0.45]
        time = 6    [0.45 0.20 0.10 0.25]
        time = 7    [0.20 0.45 0.10 0.25]

        Each vector involves (post) probabilities for koibito (K),
        doryo (D), yujin (Y), kazoku (Kz), respectively.

        Then the instantaneous decisions will be:
        [K Y K D K Kz K D]

    -----------------------------------------------------------------------
    event-based + voting:
        picks the class with highest number of votes among all events.
        So eventually the dyad has a single label (discrete output).

        For the above case, the votes are as follows:
        K = 4
        D = 2
        Y = 1
        Kz= 1

        So the output will be K. If this decision is correct it will give
        a 1, otherwise a 0. Actually in the confusion matrix, I also store
        the exact mistakes (off-diagonal).

    -----------------------------------------------------------------------
    event-based + empirical probability:
        the instantaneous decisions are expressed as empirical
        probabilities.

        For instance, for the above example, the empirical probabilities
        are:
        [4/8 2/8 1/8 1/8]
        for koibito (K), doryo (D), yujin (Y), kazoku (Kz), respectively.

        I use a confusion matrix to see the off-diagonal.

    -----------------------------------------------------------------------
    trajectory-based:
        treats a trajectory as a single entity. See below for details of:
            trajectory-based + prob
            trajectory-based + binary
            trajectory-based + confidence

    -----------------------------------------------------------------------
    trajectory-based + prob:
        returns the probabilities of each possible outcome as an average
        of probabilities at each time instant.

        For the above case, we compute cumulative probabilities as an
        average of probabilities at each time instant as follows:
        K = mean([.45, .20, .45, .20, .45, .25, .45, .20]) = 0.33125
        D = mean([.20, .10, .20, .45, .20, .20, .20, .45]) = 0.25
        Y = mean([.10, .45, .10, .10, .10, .10, .10, .10]) = 0.14375
        Kz= mean([.25, .25, .25, .25, .25, .45, .25, .25]) = 0.275

    -----------------------------------------------------------------------
    trajectory-based + binary:
        returns the class with highest probability as the output
        (decision) class. For the above case, the decision will be K.

        K = 0.33125
        D = 0.25
        Y = 0.14375
        Kz= 0.275

        argmax([K, D, Y, Kz]) = K

    -----------------------------------------------------------------------
    trajectory-based + confidence:
        returns a confidence metric which is defined as below:

        conf = 1 - (p_max - p_gt)

        Here p_max is the highest probability (among the probabilities
        associated with each possible outcome (ie class)). On the other
        hand, p_gt is the probability that is associated with the gt
        class.

        For the above case, assuming that the gt class is D, conf will be:
        conf = 1 - (0.33125 - 0.25) = 0.91875

        This value is 1 when the highest probability is associated with
        the gt class. When another class other than the gt class has a
        higher probability, it gives the extent of the difference. Values
        close to 1 indicate that there is a mistake but not that big.

    -----------------------------------------------------------------------
    collective:
        treats all observations from each gt class equally. Namely, it
        boils down to four long trajectories for koibito, doryo, yujin and
        kazoku. See below for details of:
            collective + confidence
            collective + binary

    -----------------------------------------------------------------------
    collective + confidence:
        I compute confidence at each single observation point (ie
        trajectory point). I do not store all these values. Instead, I
        store only the variables to compute statistics. Namely:
            the number of observations     (cum_n_observations)
            the sum of confidence values   (cum_confidence)
            the sum of squares of confidence values (cum_confidence_sq)

    -----------------------------------------------------------------------
    collective + binary:
        At each observation point, I make a binary decision and store the
        number of success and fails. The keys in the dictionary are:
            n_suc
            n_fail
    """
    # Function-scope import so the block does not depend on the file header.
    import warnings

    classes = preferences.CLASSES

    for class_gt in classes:
        for test_fname in test_fnames[class_gt]:
            data = np.load(test_fname)
            data_A, data_B = generic_tools.extract_individual_data(data)
            N_observations = len(data_A)  # len(data_B) is the same
            if N_observations == 0:
                # Without observations the vote normalisation below would
                # divide by zero after conf_mat was already partly updated.
                warnings.warn(
                    'estimate: skipping {} (no observations)'.format(
                        test_fname))
                continue

            obs_data = generic_tools.compute_observables(data_A, data_B)
            bins = {
                o: generic_tools.find_bins(o, obs_data[o])
                for o in preferences.OBSERVABLES
            }
            p_posts = self.compute_probabilities(bins, alpha)

            ###############################################################
            #
            # single pass over the observations; it feeds both the
            # event-based votes and the collective confidence statistics
            #
            n_votes = {class_temp: 0 for class_temp in classes}
            cum_confidence = 0
            cum_confidence_sq = 0
            for i in range(N_observations):
                # instantaneous probabilities of every class
                p_inst = {
                    class_temp: p_posts[class_temp][i]
                    for class_temp in classes
                }
                # the vote goes to the class with highest prob
                class_est = max(p_inst, key=p_inst.get)
                n_votes[class_est] += 1

                # instantaneous confidence: 1 when the gt class has the
                # highest probability, lower otherwise
                conf_inst = 1 - (p_inst[class_est] - p_inst[class_gt])
                cum_confidence += conf_inst
                cum_confidence_sq += conf_inst * conf_inst

            ###############################################################
            #
            # event based
            #
            class_est_voting_winner = max(n_votes, key=n_votes.get)
            self.conf_mat['event_based']['voting'][class_gt][
                class_est_voting_winner] += 1

            # scale the votes to 1, such that they represent probabilities
            factor = 1.0 / sum(n_votes.values())
            for class_est in classes:
                # class_est is not really the 'output decision'
                # here I only keep the probability associated with every
                # possible outcome
                self.conf_mat['event_based']['emp_probs'][class_gt][
                    class_est] += n_votes[class_est] * factor

            ###############################################################
            #
            # trajectory-based
            #
            p_mean = {}
            for class_est in classes:
                # class_est is not really the 'output decision'
                p_mean[class_est] = np.mean(p_posts[class_est])
                self.conf_mat['trajectory_based']['prob'][class_gt][
                    class_est].append(p_mean[class_est])

            c_out = max(p_mean, key=p_mean.get)
            p_max = p_mean[c_out]
            self.conf_mat['trajectory_based']['binary'][class_gt][
                c_out] += 1

            p_gt = p_mean[class_gt]
            confidence = 1 - (p_max - p_gt)
            self.conf_mat['trajectory_based']['confidence'][
                class_gt].append(confidence)

            ###############################################################
            #
            # collectively, ie dumping all observations from each class in
            # one set, as if it is one long trajectory
            #
            temp_suc = n_votes[class_gt]
            # compare class labels by value, not by object identity
            temp_fail = sum(
                n_votes[class_est] for class_est in classes
                if class_est != class_gt)
            self.conf_mat['collective']['binary'][class_gt][
                'n_suc'] += temp_suc
            self.conf_mat['collective']['binary'][class_gt][
                'n_fail'] += temp_fail

            ###############################################################
            #
            # collective + confidence
            #
            self.conf_mat['collective']['confidence'][class_gt][
                'cum_n_observations'] += N_observations
            self.conf_mat['collective']['confidence'][class_gt][
                'cum_confidence'] += cum_confidence
            self.conf_mat['collective']['confidence'][class_gt][
                'cum_confidence_sq'] += cum_confidence_sq
def estimate(self, alpha, filtering, test_fnames):
    """Classify every test trajectory with the ND model and update conf_mat.

    For each test file the posterior probabilities of all classes are
    computed at every observation and folded into ``self.conf_mat``:

    * ``['event_based']['voting']``: class with most instantaneous wins.
    * ``['event_based']['emp_probs']``: share of instantaneous wins per
      class.
    * ``['trajectory_based']['prob']``: mean posterior of every class.
    * ``['trajectory_based']['binary']``: class with highest mean posterior.
    * ``['trajectory_based']['confidence']``: ``1 - (p_max - p_gt)`` on the
      mean posteriors.
    * ``['collective']['binary']``: instantaneous successes (``n_suc``) and
      failures (``n_fail``).
    * ``['collective']['confidence']``: count, sum and sum of squares of
      the instantaneous confidences.

    Trajectories without any observation are skipped with a warning.

    Parameters
    ----------
    alpha :
        Forwarded unchanged to the ``compute_probabilities_ND_*`` methods.
    filtering : str
        ``'none'`` or ``'MA'`` to use the binned pdfs
        (``compute_probabilities_ND_woKDE``), ``'KDE'`` to use the kernel
        density estimators (``compute_probabilities_ND_wKDE``).
    test_fnames : mapping
        Maps every class in ``preferences.CLASSES`` to an iterable of file
        names that ``np.load`` can read.

    Raises
    ------
    ValueError
        If ``filtering`` is not one of the values listed above.
    """
    # Function-scope import so the block does not depend on the file header.
    import warnings

    classes = preferences.CLASSES

    for class_gt in classes:
        for test_fname in test_fnames[class_gt]:
            data = np.load(test_fname)
            data_A, data_B = generic_tools.extract_individual_data(data)
            N_observations = len(data_A)  # len(data_B) is the same
            if N_observations == 0:
                # Without observations the vote normalisation below would
                # divide by zero after conf_mat was already partly updated.
                warnings.warn(
                    'estimate: skipping {} (no observations)'.format(
                        test_fname))
                continue

            obs_data = generic_tools.compute_observables(data_A, data_B)

            # Strings are compared by value: identity checks against
            # literals are not reliable.
            if filtering in ('none', 'MA'):
                bins = generic_tools.find_bins_ND(obs_data)
                p_posts = self.compute_probabilities_ND_woKDE(bins, alpha)
            elif filtering == 'KDE':
                p_posts = self.compute_probabilities_ND_wKDE(
                    N_observations, obs_data, alpha)
            else:
                # Only printing here would leave p_posts undefined below.
                raise ValueError(
                    'estimate: undefined filtering status {!r} '
                    "(expected 'none', 'KDE' or 'MA')".format(filtering))

            ###############################################################
            #
            # single pass over the observations; it feeds both the
            # event-based votes and the collective confidence statistics
            #
            n_votes = {class_temp: 0 for class_temp in classes}
            cum_confidence = 0
            cum_confidence_sq = 0
            for i in range(N_observations):
                # instantaneous probabilities of every class
                p_inst = {
                    class_temp: p_posts[class_temp][i]
                    for class_temp in classes
                }
                # the vote goes to the class with highest prob
                class_est = max(p_inst, key=p_inst.get)
                n_votes[class_est] += 1

                # instantaneous confidence: 1 when the gt class has the
                # highest probability, lower otherwise
                conf_inst = 1 - (p_inst[class_est] - p_inst[class_gt])
                cum_confidence += conf_inst
                cum_confidence_sq += conf_inst * conf_inst

            ###############################################################
            #
            # event based
            #
            class_est_voting_winner = max(n_votes, key=n_votes.get)
            self.conf_mat['event_based']['voting'][class_gt][
                class_est_voting_winner] += 1

            # scale the votes to 1, such that they represent probabilities
            factor = 1.0 / sum(n_votes.values())
            for class_est in classes:
                # class_est is not really the 'output decision'
                # here I only keep the probability associated with every
                # possible outcome
                self.conf_mat['event_based']['emp_probs'][class_gt][
                    class_est] += n_votes[class_est] * factor

            ###############################################################
            #
            # trajectory-based
            #
            p_mean = {}
            for class_est in classes:
                # class_est is not really the 'output decision'
                p_mean[class_est] = np.mean(p_posts[class_est])
                self.conf_mat['trajectory_based']['prob'][class_gt][
                    class_est].append(p_mean[class_est])

            c_out = max(p_mean, key=p_mean.get)
            p_max = p_mean[c_out]
            self.conf_mat['trajectory_based']['binary'][class_gt][c_out] += 1

            p_gt = p_mean[class_gt]
            confidence = 1 - (p_max - p_gt)
            self.conf_mat['trajectory_based']['confidence'][
                class_gt].append(confidence)

            ###############################################################
            #
            # collectively, ie dumping all observations from each class in
            # one set, as if it is one long trajectory
            #
            temp_suc = n_votes[class_gt]
            # compare class labels by value, not by object identity
            temp_fail = sum(
                n_votes[class_est] for class_est in classes
                if class_est != class_gt)
            self.conf_mat['collective']['binary'][class_gt][
                'n_suc'] += temp_suc
            self.conf_mat['collective']['binary'][class_gt][
                'n_fail'] += temp_fail

            ###############################################################
            #
            # collective + confidence
            #
            self.conf_mat['collective']['confidence'][class_gt][
                'cum_n_observations'] += N_observations
            self.conf_mat['collective']['confidence'][class_gt][
                'cum_confidence'] += cum_confidence
            self.conf_mat['collective']['confidence'][class_gt][
                'cum_confidence_sq'] += cum_confidence_sq
histograms1D[o], pdf1D[o], mean_pdfs[o] = {}, {}, {} for c in preferences.CLASSES: histograms1D[o][c] = [] pdf1D[o][c] = [] mean_pdfs[o][c] = [] data_fnames = generic_tools.get_data_fnames('data/classes/') for c in preferences.CLASSES: for file_path in data_fnames[c]: data = np.load(file_path) data_A, data_B = generic_tools.extract_individual_data(data) obs_data = generic_tools.compute_observables(data_A, data_B) for o in preferences.OBSERVABLES: edges = get_edges(o) temp_hist = generic_tools.compute_histogram_1D(o, obs_data[o]) temp_pdf = generic_tools.compute_pdf(o, temp_hist) histograms1D[o][c].append(temp_hist ) pdf1D[o][c].append( temp_pdf ) mean_pdfs[o][c].append( np.average(edges, weights=temp_pdf) ) print('Obs\tF_d\tv1\tv2') print('-------------------------------')