import time

import numpy as np

# Note: mlab (the MATLAB bridge), empiricalEntropyRate and the Geolife helper
# functions used below are assumed to be provided elsewhere in this module.


def process_symbolic_data(data, standard_method=True, refined_method=False):
    """
    Given a list of trajectories (regularly sampled location integer symbols),
    returns the requested upper bound(s) on the upper limit of predictability.

    Returns a single list of per-person bounds, or the pair
    (standard, refined) when both methods are requested.

    :param data: List of trajectories
    :type data: list of list of int
    :param standard_method: True to calculate the upper bound on the upper
        limit of predictability using the original method by Song et al.
    :type standard_method: bool
    :param refined_method: True to calculate the upper bound on the upper
        limit of predictability using the refined method from our PERCOM paper.
    :type refined_method: bool
    """

    if not (standard_method or refined_method):
        raise ValueError("At least one of standard_method or refined_method "
                         "must be True")

    if refined_method:
        S_RL, N_RL = empiricalEntropyRate(data, "RL")

    if standard_method:
        S_DL, N_DL = empiricalEntropyRate(data, "DL")

    mlab.openPool()

    if refined_method:
        tmpG_RL = list(mlab.ParLoP(S_RL, N_RL)[0])

    if standard_method:
        tmpG_DL = list(mlab.ParLoP(S_DL, N_DL)[0])

    mlab.closePool()

    if standard_method:
        g_dl = np.asarray(tmpG_DL)
        print("\nStandard method: AVG: {} MIN: {} MAX: {}".format(
            g_dl.mean(), g_dl.min(), g_dl.max()))

    if refined_method:
        g_rl = np.asarray(tmpG_RL)
        print("\nRefined method: AVG: {} MIN: {} MAX: {}".format(
            g_rl.mean(), g_rl.min(), g_rl.max()))

    if refined_method and standard_method:
        return tmpG_DL, tmpG_RL

    if refined_method:
        return tmpG_RL

    return tmpG_DL
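
# A minimal usage sketch (illustrative): the toy trajectories below are
# hypothetical stand-ins for symbolised location data, and the call assumes
# the mlab MATLAB bridge used above is configured.
def _demo_process_symbolic_data():
    toy_data = [
        [1, 2, 1, 3, 1, 2, 1, 2],  # person 1: regularly sampled location symbols
        [4, 4, 5, 4, 6, 4, 5, 4],  # person 2
    ]
    # Requesting both methods returns the pair (standard, refined).
    g_standard, g_refined = process_symbolic_data(
        toy_data, standard_method=True, refined_method=True)
    return g_standard, g_refined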
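
# --- Background sketch (illustrative assumption) ---
# The MATLAB helper ParLoP.m is not shown in this excerpt. In Song et al.'s
# method the predictability bound is obtained by solving Fano's inequality,
#     S = H(p) + (1 - p) * log2(N - 1),
# for p, where S is the entropy rate, N the number of distinct locations and
# H the binary entropy function. The pure-Python solver below is a
# hypothetical stand-in for that solve, not the project's MATLAB code.
from scipy.optimize import brentq


def solve_fano_bound(S, N):
    """Solve Fano's inequality for the upper bound p on predictability.

    Assumes N >= 2 and 0 < S < log2(N); ParLoP.m reports the S > log2(N)
    case with its -99 sentinel (see the comments in run() below).
    """
    if S >= np.log2(N):
        return -99.0  # mirror the known-fail sentinel used by ParLoP.m

    def gap(p):
        h = -p * np.log2(p) - (1.0 - p) * np.log2(1.0 - p)  # binary entropy
        return h + (1.0 - p) * np.log2(N - 1) - S

    # gap() falls monotonically from log2(N) - S > 0 at p = 1/N to -S < 0
    # near p = 1, so a unique root exists; shrink the bracket for stability.
    return brentq(gap, 1.0 / N + 1e-9, 1.0 - 1e-9)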
def run(group="All", scale=None,
        output_dir='./ResultsLoP_replication/final_graphs',
        bulk_build_preprocessing=False):
    """
    Generates a single heatmap for a given list of Geolife ids, for a given
    method of computing the upper bound on the upper limit of predictability.

    :param group: "All", or ["id_str", [list of ids in the Geolife dataset]]
    :type group: str or nested list
    :param scale: [min_z, max_z, step]  Sets the scale of the heatmap z axis
    :type scale: float array
    :param output_dir: Directory in which the result files are written
    :type output_dir: str
    :param bulk_build_preprocessing: True to pre-build the symbolisation
        caches in parallel before processing
    :type bulk_build_preprocessing: bool
    """
    t = time.time()

    # Group setting
    if group == "All":
        suffix = "All"
        persons = "All"
    else:
        suffix = "Grp{}".format(group[0])
        persons = group[1]

    if not output_dir.endswith('/'):
        output_dir = output_dir + '/'

    file_name = "{}Heatmap_{}".format(output_dir, suffix)

    ensure_dir(file_name)

    print("Calculating the LoP for {}".format(suffix))
    
    if bulk_build_preprocessing:
        # Attempt to bulk build the cache using multiple CPU cores, skipping
        # caches that are already built. If this option is not specified and
        # a cache does not exist, it will be built when required, using a
        # single CPU core.
        GeolifeSymbolisation.bulk_build_resolution_cache(listSpatialRes,
                                                         listTemporalRes)
    
    mlab.openPool()
    failed_ids = set()
    LoP_RL = []
    LoP_DL = []
    LoP_failed_ct = []
    passed_norm_test = []
    for spatialRes in listSpatialRes:
        LoP_RL.append([])
        LoP_DL.append([])
        LoP_failed_ct.append([])
        passed_norm_test.append([])
        for temporalRes in listTemporalRes:
            
            # -----------------------------------------------------------------
            # Load data from an existing preprocessed database; it is created
            # here if it does not already exist.
            data, person_ids = get_geolife_data(spatialRes, temporalRes,
                                                persons)
            # -----------------------------------------------------------------

            # Sanity check on loading
            for person in data:
                if len(person) == 0:
                    raise Exception("One or more person's trajectory was not "
                                    "loaded/created correctly.")

            S_RL, N_RL = empiricalEntropyRate(data, 'RL')
            S_DL, N_DL = empiricalEntropyRate(data, 'DL')
                    
            # Solve for the per-person predictability bounds in MATLAB.
            tmpG_RL = list(mlab.ParLoP(S_RL, N_RL)[0])
            tmpG_DL = list(mlab.ParLoP(S_DL, N_DL)[0])

            # Sentinel values returned by the solver:
            #   -88  real (unexplained) failure in the solve
            #   -99  known failure in the solve, when S > log2(N)
            # See the MATLAB script (ParLoP.m) for more details.

            if (np.asarray(tmpG_RL) == -88).any():
                raise Exception("ERROR: (RL) Matlab failed the solve, but the "
                                "entropy was in the correct range, so an "
                                "unknown error has occurred.")

            if (np.asarray(tmpG_DL) == -88).any():
                raise Exception("ERROR: (DL) Matlab failed the solve, but the "
                                "entropy was in the correct range, so an "
                                "unknown error has occurred.")
            
            
            # Filter out known solve fails. These are cases where the measured
            # entropy is too high, i.e. the LZ entropy rate estimate is wrong
            # (the estimator has failed to converge). There is no way to
            # correct this without collecting more data from the individual.
            # While excluding the individual is not ideal, it is better than
            # including a value that is *known* to be erroneous, so we discard
            # the individual.
            tmpG_RL = np.asarray(tmpG_RL)
            tmpG_DL = np.asarray(tmpG_DL)

            tmpG_RL_known_fail_mask = tmpG_RL < -1
            tmpG_DL_known_fail_mask = tmpG_DL < -1

            # To be comparable, both methods must be evaluated over a
            # consistent set of individuals, so drop anyone who failed under
            # either method.
            tmpG_known_fail_mask = (tmpG_RL_known_fail_mask
                                    | tmpG_DL_known_fail_mask)

            failed_ct = int(tmpG_known_fail_mask.sum())
            failed_ids.update(np.asarray(person_ids)[tmpG_known_fail_mask])

            tmpG_RL = list(tmpG_RL[~tmpG_known_fail_mask])
            tmpG_DL = list(tmpG_DL[~tmpG_known_fail_mask])

            if len(tmpG_RL) != len(tmpG_DL):
                raise Exception("Internal error: RL and DL lists differ in "
                                "length after filtering.")

            if (np.asarray(tmpG_RL) < 0).any():
                raise Exception("Internal error: a negative bound survived "
                                "the known-fail filtering.")
            
                
            # Save the averages for this (spatial, temporal) resolution cell.
            LoP_RL[-1].append(np.average(tmpG_RL))
            LoP_DL[-1].append(np.average(tmpG_DL))
            LoP_failed_ct[-1].append(failed_ct)

    mlab.closePool()

    save_results(file_name, LoP_RL, 'RL')
    save_results(file_name, LoP_DL, 'DL')

    print('failed_ids = {}'.format(failed_ids))
    with open(file_name + "_failed_ct.csv", 'w') as f2:
        np.savetxt(f2, LoP_failed_ct, fmt="%.5f")

    print("Done in {} seconds".format(time.time() - t))