    def set_test_dists(self, test_fnames):
        
        # initialize empty histograms
        # since one histogram/pdf is computed for each element of the test set
        # as below, they need to be initialized at every test run
        for c in preferences.CLASSES:
            self.test_histograms[c], self.test_pdfs[c] = {}, {}
            for test_fname in test_fnames[c]:
                self.test_histograms[c][test_fname], self.test_pdfs[c][test_fname] = {}, {}
                for o in preferences.OBSERVABLES:
                    self.test_histograms[c][test_fname][o] = data_tools.initialize_histogram(o)
                    self.test_pdfs[c][test_fname][o] = []
                
        # compute histograms for each class (using test set)
        for c in preferences.CLASSES:   
            for test_fname in test_fnames[c]:

                data = np.load(test_fname)
                data_A, data_B = data_tools.extract_individual_data(data)
                obs_data = data_tools.compute_observables(data_A, data_B)
                
                for o in preferences.OBSERVABLES:
                    self.test_histograms[c][test_fname][o] = data_tools.compute_histogram_1D(o, obs_data[o])

        for c in preferences.CLASSES:
            for test_fname in test_fnames[c]:
                for o in preferences.OBSERVABLES:
                    self.test_pdfs[c][test_fname][o] = data_tools.compute_pdf(o, self.test_histograms[c][test_fname][o])
    def set_train_dists(self, train_fnames):

        # initialize empty histograms
        # since the histogram is accumulated as below, it needs to be
        # initialized at every training
        for c in preferences.CLASSES:
            self.train_histograms[c] = {}
            self.train_pdfs[c] = {}
            for o in preferences.OBSERVABLES:
                self.train_histograms[c][o] = data_tools.initialize_histogram(o)

        # compute histograms for each class (using the training set)
        for c in preferences.CLASSES:
            for train_fname in train_fnames[c]:

                data = np.load(train_fname)
                data_A, data_B = data_tools.extract_individual_data(data)
                obs_data = data_tools.compute_observables(data_A, data_B)

                for o in preferences.OBSERVABLES:
                    self.train_histograms[c][o] += data_tools.compute_histogram_1D(o, obs_data[o])

        for c in preferences.CLASSES:
            for o in preferences.OBSERVABLES:
                self.train_pdfs[c][o] = data_tools.compute_pdf(o, self.train_histograms[c][o])
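
The two methods above only rely on a few properties of the data_tools helpers: initialize_histogram must return a zeroed bin-count array that the output of compute_histogram_1D can be accumulated into with +=, and compute_pdf must normalize the accumulated counts into a distribution. A minimal sketch under those assumptions (the bin count and value range below are placeholders, not the project's actual settings):

    import numpy as np

    def initialize_histogram(o, n_bins=50):
        # zeroed per-observable bin counts, so class histograms can be accumulated with +=
        return np.zeros(n_bins)

    def compute_histogram_1D(o, values, n_bins=50, value_range=(0.0, 1.0)):
        # bin the observable values of one recording into the same fixed bins
        counts, _ = np.histogram(values, bins=n_bins, range=value_range)
        return counts

    def compute_pdf(o, histogram):
        # normalize the accumulated counts into a discrete pdf (sums to 1)
        total = histogram.sum()
        return histogram / total if total > 0 else histogram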
Example #3
    def train(self, train_fnames):

        train_histograms1D = {}
        # initialize empty histograms
        for o in preferences.OBSERVABLES:
            train_histograms1D[o], self.train_pdfs1D[o] = {}, {}
            for c in preferences.CLASSES:
                train_histograms1D[o][c] = data_tools.initialize_histogram(o)

        # compute histograms for each class
        for c in preferences.CLASSES:
            for file_path in train_fnames[c]:
                data = np.load(file_path)
                data_A, data_B = data_tools.extract_individual_data(data)
                obs_data = data_tools.compute_observables(data_A, data_B)
                for o in preferences.OBSERVABLES:
                    train_histograms1D[o][c] += data_tools.compute_histogram_1D(o, obs_data[o])

        for o in preferences.OBSERVABLES:
            for c in preferences.CLASSES:
                self.train_pdfs1D[o][c] = data_tools.compute_pdf(
                    o, train_histograms1D[o][c])
Example #4
if __name__ == "__main__":

    start_time = time.time()
    
    data_fnames = file_tools.get_data_fnames('../data/gender_compositions/')

    histograms1D = {}
    pdfs1D = {}
    # initialize empty histograms
    for o in preferences.OBSERVABLES:
        histograms1D[o], pdfs1D[o] = {}, {}
        for c in preferences.CLASSES_RAW:
            histograms1D[o][c] = data_tools.initialize_histogram(o)
            
    # compute histograms for each class
    for c in preferences.CLASSES_RAW:   
        for file_path in data_fnames[c]:
            data = np.load(file_path)
            data_A, data_B = data_tools.extract_individual_data(data)
            obs_data = data_tools.compute_observables(data_A, data_B)
            for o in preferences.OBSERVABLES:
                histograms1D[o][c] += data_tools.compute_histogram_1D(o, obs_data[o])
                
    for o in preferences.OBSERVABLES:
        for c in preferences.CLASSES_RAW:
            pdfs1D[o][c] = data_tools.compute_pdf(o, histograms1D[o][c])
            
    plot_pdf(pdfs1D)
            
    elapsed_time = time.time() - start_time
    print('\nTime elapsed %2.2f sec' % elapsed_time)
Example #5
    def estimate(self, alpha, test_fnames):
        """
        
        Performance is evaluated in various ways. Below I explain each of these
        on a toy example.
        
        Let's assume we have a dyad which is annotated as Doryo (D) and whose
        trajectory has length 8. Suppose the posterior probabilities are found
        as follows:
            
            time = t [K     D     Y     Kz  ]
            time = 0 [0.45  0.20  0.10  0.25]
            time = 1 [0.20  0.10  0.45  0.25]
            time = 2 [0.45  0.20  0.10  0.25]
            time = 3 [0.20  0.45  0.10  0.25]
            time = 4 [0.45  0.20  0.10  0.25]
            time = 5 [0.25  0.20  0.10  0.45]
            time = 6 [0.45  0.20  0.10  0.25]
            time = 7 [0.20  0.45  0.10  0.25]
           
        Here, each vector contains the (posterior) probabilities of koibito (K),
        doryo (D), yujin (Y), and kazoku (Kz), respectively.
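
        For concreteness, the sketches interleaved below assume this toy table
        is available as a plain dict of per-class probability lists (the names
        p_post and class_gt are purely illustrative and are not attributes of
        this class):

            p_post = {'K':  [0.45, 0.20, 0.45, 0.20, 0.45, 0.25, 0.45, 0.20],
                      'D':  [0.20, 0.10, 0.20, 0.45, 0.20, 0.20, 0.20, 0.45],
                      'Y':  [0.10, 0.45, 0.10, 0.10, 0.10, 0.10, 0.10, 0.10],
                      'Kz': [0.25, 0.25, 0.25, 0.25, 0.25, 0.45, 0.25, 0.25]}
            class_gt = 'D'  # annotated (ground-truth) relation of the toy dyad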
            
        -----------------------------------------------------------------------
        trajectory-based decision: yields a single estimate of the social
        relation for each dyad.
        
        TODO
            
        -----------------------------------------------------------------------      
        binary_by_event
        
        For each trajectory data point (event), we make an instantaneous
        decision by choosing the social relation class with the highest
        posterior probability.
        
        Then the instantaneous decisions will be:
                [K Y K D K Kz K D]
                
        We build a confusion matrix such that the rows represent the true class
        and the columns represent the assigned class. The above dyad contributes 
        to the confusion matrix (before scaling) as follows:
                                               
                                    [K D Y Kz ]
                                    | 0 0 0 0 |
         old_mats_before_scaling +  | 4 2 1 1 |
                                    | 0 0 0 0 |
                                    | 0 0 0 0 |
                                    
        After processing all dyads, each row is scaled to 1.
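
        A minimal sketch of this rule on the toy p_post above (plain Python for
        illustration, not the actual implementation):

            decisions = [max(p_post, key=lambda c: p_post[c][i]) for i in range(8)]
            # decisions == ['K', 'Y', 'K', 'D', 'K', 'Kz', 'K', 'D']
            row_for_gt_D = [decisions.count(c) for c in ['K', 'D', 'Y', 'Kz']]
            # row_for_gt_D == [4, 2, 1, 1], added to the row of the gt class (D)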
        -----------------------------------------------------------------------
        binary_by_trajectory_voting
        
        According to this assessment method, for each dyad we pick the social
        relation class with the highest number of votes among all trajectory
        data points. So eventually the dyad gets a single estimate (a discrete
        output).
        
        For the above case, the votes are as follows:
            K = 4
            D = 2
            Y = 1
            Kz= 1
            
        So the estimated class will be K. A correct decision contributes a 1 to
        the diagonal; since the confusion matrix also stores the exact mistakes,
        an incorrect decision contributes a 1 to the corresponding off-diagonal
        entry.
        
        The above dyad contributes to the confusion matrix (before scaling) as 
        follows:
                                    [K D Y Kz ]
                                    | 0 0 0 0 |
         old_mats_before_scaling +  | 1 0 0 0 |
                                    | 0 0 0 0 |
                                    | 0 0 0 0 |
                                    
        After processing all dyads, each row is scaled to 1.        
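
        A sketch of the voting rule on the toy p_post above (illustration only):

            n_votes = {c: 0 for c in p_post}
            for i in range(8):
                n_votes[max(p_post, key=lambda c: p_post[c][i])] += 1
            # n_votes == {'K': 4, 'D': 2, 'Y': 1, 'Kz': 1}
            class_est = max(n_votes, key=n_votes.get)  # 'K', a miss for the gt class D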
        -----------------------------------------------------------------------
        binary_by_trajectory_probability
        
        For every dyad, we take the average probability concerning each social
        relation class over all trajectory data points. The class with the
        highest probability is the estimated class.
        
        For the above case, the average probabilities are:
            
            K = 0.33125
            D = 0.25
            Y = 0.14375
            Kz= 0.275
        
        So the decision will be K.
            
        The above dyad contributes to the confusion matrix (before scaling) as 
        follows:
                                    [K D Y Kz ]
                                    | 0 0 0 0 |
         old_mats_before_scaling +  | 1 0 0 0 |
                                    | 0 0 0 0 |
                                    | 0 0 0 0 |

        After processing all dyads, each row is scaled to 1.        
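
        A sketch of this rule on the toy p_post above (illustration only):

            p_mean = {c: sum(p_post[c]) / 8.0 for c in p_post}
            # p_mean == {'K': 0.33125, 'D': 0.25, 'Y': 0.14375, 'Kz': 0.275}
            class_est = max(p_mean, key=p_mean.get)  # 'K'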
        -----------------------------------------------------------------------
        probabilistic_by_event
        
        This assessment method yields two outcomes: 
            1. Confusion matrix
            2. Confidence values
        
        1. Confusion matrix:
            I accumulate the posterior probabilities derived from each trajectory
            data point of each dyad in a confusion matrix.
            
        The above dyad contributes to the confusion matrix (before scaling) as 
        follows:
                                     | 0     0     0     0   |
            old_mats_before_scaling +| 0.45  0.20  0.10  0.25| + 
                                     | 0     0     0     0   |
                                     | 0     0     0     0   |
                                     
                | 0     0     0     0   |   | 0     0     0     0   |
                | 0.20  0.10  0.45  0.25| + | 0.45  0.20  0.10  0.25| + ...
                | 0     0     0     0   |   | 0     0     0     0   |
                | 0     0     0     0   |   | 0     0     0     0   |
                
                | 0     0     0     0   |   
                | 0.20  0.45  0.10  0.25|
                | 0     0     0     0   |  
                | 0     0     0     0   |   

        After processing all dyads, each row is scaled to 1.        

        2. Confidence values:
            
            The confidence is defined as below:
                conf = 1 - (p_max - p_gt)

            Here p_max is the highest probability among the probabilities
            associated with the possible outcomes (ie classes), while p_gt is
            the probability associated with the ground-truth (gt) class.

            I compute the confidence at each single observation point (ie
            trajectory point). I do not store all these values. Instead, I store
            only the variables necessary to compute statistics, namely:
                the number of observations
                the sum of confidence values
                the sum of squares of confidence values
                
            For the above example, the confidence values will be

            time = 0:  1 - (0.45 - 0.20) = 0.75
            time = 1:  1 - (0.45 - 0.10) = 0.65
            time = 2:  1 - (0.45 - 0.20) = 0.75
            time = 3:  1 - (0.45 - 0.45) = 1
            time = 4:  1 - (0.45 - 0.20) = 0.75
            time = 5:  1 - (0.45 - 0.20) = 0.75
            time = 6:  1 - (0.45 - 0.20) = 0.75
            time = 7:  1 - (0.45 - 0.45) = 1

            and I will update the stored values as follows:
                number_of_observations += 8
                sum_confidence_values += 0.75 + 0.65 + 0.75 + 1 + 0.75 + 0.75 +
                0.75 + 1
                sum_of_squares_of_confidence_values += 0.75**2 + 0.65**2 +
                0.75**2 + 1**2 + 0.75**2 + 0.75**2 + 0.75**2 + 1**2
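
            A sketch of this bookkeeping on the toy p_post above (illustration
            only; the real code keeps these counters per gt class):

                n_obs, cum_conf, cum_conf_sq = 0, 0.0, 0.0
                for i in range(8):
                    p_inst = {c: p_post[c][i] for c in p_post}
                    conf = 1 - (max(p_inst.values()) - p_inst[class_gt])
                    n_obs += 1
                    cum_conf += conf
                    cum_conf_sq += conf ** 2
                # n_obs == 8 and cum_conf == 6.4, so the mean confidence is 0.8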
                
        -----------------------------------------------------------------------
        probabilistic_by_trajectory
        
        This assessment method yields two outcomes: 
            1. Confusion matrix
            2. Confidence values
            
        1. Confusion matrix:
            Remember from binary_by_trajectory_probability that we computed the
            average probability concerning each social relation class over all
            trajectory data points.
            
            For the above case, the average probabilities were:
                K = 0.33125
                D = 0.25
                Y = 0.14375
                Kz= 0.275
            
            The above dyad contributes to the confusion matrix (before scaling)
            as follows:
                                         | 0        0     0        0    |
                old_mats_before_scaling +| 0.33125  0.25  0.14375  0.275| 
                                         | 0        0     0        0    |
                                         | 0        0     0        0    |
                                         
            After processing all dyads, each row is scaled to 1.        

                
        2. Confidence values:
            I compute the confidence on the above probability vector:
                confidence = 1 - (0.33125 - 0.25)
                           = 0.91875

            and I update the stored values as follows:
                number_of_observations += 1
                sum_confidence_values += 0.91875
                sum_of_squares_of_confidence_values += 0.91875**2
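
            A sketch of this trajectory-level confidence (illustration only,
            with the mean probabilities computed above):

                p_mean = {'K': 0.33125, 'D': 0.25, 'Y': 0.14375, 'Kz': 0.275}
                confidence = 1 - (max(p_mean.values()) - p_mean['D'])  # gt class is D
                # confidence == 0.91875; each dyad adds one observation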
        -----------------------------------------------------------------------
        empirical_probability_by_trajectory

        This assessment method yields two outcomes: 
            1. Confusion matrix
            2. Confidence values
        
        1. Confusion matrix:
            Here, I first derive a probability vector from the votes given at 
            each data point.
            
            For the above case, the votes were as follows:
            K = 4
            D = 2
            Y = 1
            Kz= 1
            
            So I build a probability vector as:
                [K   D   Y   Kz ]
                [4/8 2/8 1/8 1/8]
                
            Then this vector is added to the confusion matrix as below
                                     | 0     0     0      0    |
            old_mats_before_scaling +| 0.50  0.25  0.125  0.125| 
                                     | 0     0     0      0    |
                                     | 0     0     0      0    |
                                     
            After processing all dyads, each row is scaled to 1.        

        2. Confidence values:

            The confidence is defined as below:
                conf = 1 - (p_max - p_gt)
            
            This time p_max is the highest value in the above vector. Similar to
            the previous case, p_gt is the probability associated with the gt 
            class.
            
            For the above case, since the gt class is given as D, conf will be:
                conf = 1 - (0.50 - 0.25)
                     = 0.75
                       
            I will update the stored values as follows:
                number_of_observations += 1
                sum_confidence_values += 0.75 
                sum_of_squares_of_confidence_values += 0.75**2 
                   
            Same as before, values close to 1 indicate that even when there is
            a mistake, it is not a big one.
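
            A sketch of this rule (illustration only, with the votes counted
            above):

                n_votes = {'K': 4, 'D': 2, 'Y': 1, 'Kz': 1}
                emp_prob = {c: n / 8.0 for c, n in n_votes.items()}
                # emp_prob == {'K': 0.5, 'D': 0.25, 'Y': 0.125, 'Kz': 0.125}
                conf = 1 - (max(emp_prob.values()) - emp_prob['D'])  # gt class is D
                # conf == 0.75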
        
        -----------------------------------------------------------------------
        """

        for class_gt in preferences.CLASSES:

            for test_fname in test_fnames[class_gt]:

                # fundamental gt, in case of hier stage 1
                # (while others contain 3 classes)
                class_gt_fund = test_fname.split('/')[3]  # hardcoded

                data = np.load(test_fname)
                data_A, data_B = data_tools.extract_individual_data(data)
                N_observations = len(data_A)  # len(data_B) is the same
                obs_data = data_tools.compute_observables(data_A, data_B)

                bins = {}
                for o in preferences.OBSERVABLES:
                    bins[o] = data_tools.find_bins(o, obs_data[o])
                p_posts = self.compute_probabilities(bins, alpha)

                ###############################################################
                #
                # binary_by_event
                #
                for i in range(0, N_observations):
                    # get all instantaneous probabilities
                    p_inst = {}  #instantaneous probabilities
                    for class_temp in preferences.CLASSES:
                        p_inst[class_temp] = p_posts[class_temp][i]

                    # class_est is the estimated class for this data point
                    class_est = max(p_inst.items(),
                                    key=operator.itemgetter(1))[0]
                    self.conf_mat['binary_by_event'][class_gt][class_est] += 1
                    if preferences.HIERARCHICAL == 'stage1':
                        self.conf_mat['binary_by_event_with_gt_fund'][
                            class_gt_fund][class_est] += 1

                # IMPORTANT:
                # This matrix needs to be scaled such that each row adds up
                # to 1. This will be done when I write the matrix to the
                # txt data file
                ###############################################################
                #
                # binary_by_trajectory_voting
                #
                n_votes = {}
                for class_temp in preferences.CLASSES:
                    n_votes[class_temp] = 0

                for i in range(0, N_observations):
                    # get all instantaneous probabilities
                    p_inst = {}  #instantaneous probabilities
                    for class_temp in preferences.CLASSES:
                        p_inst[class_temp] = p_posts[class_temp][i]

                    # one vote is given at every data point
                    # the vote goes to the class with highest post prob
                    # class_est is the estimated class for this data point
                    class_est = max(p_inst.items(),
                                    key=operator.itemgetter(1))[0]
                    n_votes[class_est] += 1

                # the estimated class is the one which receives highest number
                # of votes (along the trajectory)
                class_est_voting_winner = max(n_votes.items(),
                                              key=operator.itemgetter(1))[0]
                self.conf_mat['binary_by_trajectory_voting'][class_gt][
                    class_est_voting_winner] += 1
                if preferences.HIERARCHICAL == 'stage1':
                    self.conf_mat['binary_by_trajectory_voting_with_gt_fund'][
                        class_gt_fund][class_est_voting_winner] += 1

                # IMPORTANT:
                # This matrix needs to be scaled such that each row adds up
                # to 1. This will be done when I write the matrix to the
                # txt data file
                ###############################################################
                #
                # binary_by_trajectory_probability
                #
                p_mean = {}
                for class_est in preferences.CLASSES:
                    # class_est is not really the 'output decision'
                    p_mean[class_est] = np.mean(p_posts[class_est])

                p_max = max(p_mean.items(), key=operator.itemgetter(1))[1]
                c_out = max(p_mean.items(), key=operator.itemgetter(1))[0]
                self.conf_mat['binary_by_trajectory_probability'][class_gt][
                    c_out] += 1
                if preferences.HIERARCHICAL == 'stage1':
                    self.conf_mat[
                        'binary_by_trajectory_probability_with_gt_fund'][
                            class_gt_fund][c_out] += 1
                # IMPORTANT:
                # This matrix needs to be scaled such that each row adds up
                # to 1. This will be done when I write the matrix to the
                # txt data file
                ###############################################################
                #
                # probabilistic_by_event
                #
                for i in range(0, N_observations):
                    # get all instantaneous probabilities
                    p_inst = {}  #instantaneous probabilities
                    for class_temp in preferences.CLASSES:
                        p_inst[class_temp] = p_posts[class_temp][i]
                        self.conf_mat['probabilistic_by_event'][class_gt][
                            class_temp] += p_inst[class_temp]
                        if preferences.HIERARCHICAL == 'stage1':
                            self.conf_mat[
                                'probabilistic_by_event_with_gt_fund'][
                                    class_gt_fund][class_temp] += p_inst[
                                        class_temp]

                    # p_max is the highest instantaneous probability at this
                    # data point (p_inst, not the trajectory mean p_mean)
                    p_max = max(p_inst.items(), key=operator.itemgetter(1))[1]
                    p_gt = p_inst[class_gt]
                    confidence = 1 - (p_max - p_gt)

                    self.confidence['probabilistic_by_event'][class_gt][
                        'n_observations'] += 1
                    self.confidence['probabilistic_by_event'][class_gt][
                        'cum_confidence'] += confidence
                    self.confidence['probabilistic_by_event'][class_gt][
                        'cum_confidence_sq'] += confidence**2
                # IMPORTANT:
                # Conf_mat needs to be scaled such that each row adds up
                # to 1. This will be done by the scale_conf_mats function
                # In addition, I will derive the statistics regarding confidence,
                # ie mean and std, from the three stored values
                ###############################################################
                #
                # probabilistic_by_trajectory
                #
                p_mean = {}
                for class_est in preferences.CLASSES:
                    # class_est is not really the 'output decision'
                    p_mean[class_est] = np.mean(p_posts[class_est])
                    self.conf_mat['probabilistic_by_trajectory'][class_gt][
                        class_est] += p_mean[class_est]
                    if preferences.HIERARCHICAL == 'stage1':
                        self.conf_mat[
                            'probabilistic_by_trajectory_with_gt_fund'][
                                class_gt_fund][class_est] += p_mean[class_est]

                p_max = max(p_mean.items(), key=operator.itemgetter(1))[1]
                p_gt = p_mean[class_gt]
                confidence = 1 - (p_max - p_gt)

                self.confidence['probabilistic_by_trajectory'][class_gt][
                    'n_observations'] += 1
                self.confidence['probabilistic_by_trajectory'][class_gt][
                    'cum_confidence'] += confidence
                self.confidence['probabilistic_by_trajectory'][class_gt][
                    'cum_confidence_sq'] += confidence**2
                # IMPORTANT:
                # Conf_mat needs to be scaled such that each row adds up
                # to 1. This will be done by the scale_conf_mats function
                # In addition, I will derive the statistics regarding confidence,
                # ie mean and std, from the three stored values
                ###############################################################
                #
                # empirical_probability_by_trajectory
                #
                n_votes = {}
                for class_temp in preferences.CLASSES:
                    n_votes[class_temp] = 0

                for i in range(0, N_observations):
                    # get all instantaneous probabilities
                    p_inst = {}  #instantaneous probabilities
                    for class_temp in preferences.CLASSES:
                        p_inst[class_temp] = p_posts[class_temp][i]

                    # one vote is given at every data point
                    # the vote goes to the class with highest post prob
                    # class_est is the estimated class for this data point
                    class_est = max(p_inst.items(),
                                    key=operator.itemgetter(1))[0]
                    n_votes[class_est] += 1

                # scale the votes to 1, such that they represent probabilities
                factor = 1.0 / sum(n_votes.values())
                class_est_emp_probs = {
                    k: v * factor
                    for k, v in n_votes.items()
                }

                for class_est in preferences.CLASSES:
                    # class_est is not really the 'output decision'
                    # here I only keep the probability associated with every
                    # possible outcome
                    self.conf_mat['empirical_probability_by_trajectory'][class_gt][class_est] += \
                    class_est_emp_probs[class_est]

                    if preferences.HIERARCHICAL == 'stage1':
                        self.conf_mat['empirical_probability_by_trajectory_with_gt_fund'][class_gt_fund][class_est] += \
                        class_est_emp_probs[class_est]

                p_max = max(class_est_emp_probs.items(),
                            key=operator.itemgetter(1))[1]
                p_gt = class_est_emp_probs[class_gt]
                confidence = 1 - (p_max - p_gt)

                self.confidence['empirical_probability_by_trajectory'][
                    class_gt]['n_observations'] += 1
                self.confidence['empirical_probability_by_trajectory'][
                    class_gt]['cum_confidence'] += confidence
                self.confidence['empirical_probability_by_trajectory'][
                    class_gt]['cum_confidence_sq'] += confidence**2