def sample_choice(prob_array, method="MC"):
    """Sample a chosen index for each row of prob_array.

    prob_array - 2-d array of choice probabilities, with one agent per row
                 and one alternative per column
    method - the method used to sample a choice, either "MC" (Monte Carlo)
             or "max_prob"
    """
    if prob_array.ndim != 2:
        raise RuntimeError("prob_array must be a 2d array")
    rows, columns = prob_array.shape
    sum_prob_by_col = sum(prob_array, axis=1, dtype=float64)
    if not ma.allclose(sum_prob_by_col, ones((rows,))):
        strange_rows = where(sum_prob_by_col != ones((rows,)))
        raise RuntimeError("prob_array must add up to 1 for each row. Abnormal rows: %s" % prob_array[strange_rows, :])
    if method.lower() == "mc":
        cum_prob = ncumsum(prob_array, axis=1)
        R = uniform(0, 1, rows)
        R.resize((rows, 1))
        match = (R < cum_prob)
        choices = argmax(match, axis=1)  # the first index of True in each row
    elif method.lower() == "max_prob":
        choices = argmax(prob_array, axis=1)
    if choices.size != rows:
        raise RuntimeError("having problems sampling choices")
    return (arange(rows), choices)
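# Example (a minimal sketch with a made-up probability matrix): the "MC"
# branch draws one uniform number per agent and picks the first alternative
# whose cumulative probability exceeds it.
import numpy as np

prob = np.array([[0.1, 0.2, 0.3, 0.4],
                 [0.25, 0.25, 0.25, 0.25],
                 [0.7, 0.1, 0.1, 0.1]])    # 3 agents x 4 alternatives

cum_prob = np.cumsum(prob, axis=1)
R = np.random.uniform(0, 1, prob.shape[0])[:, np.newaxis]
choices = np.argmax(R < cum_prob, axis=1)  # first True per row
print(choices)                             # e.g. [3 1 0], one pick per agent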
def run(self, individual_dataset, fraction_dataset, id_name1='blockgroup_id', id_name2='zone_id',
        fraction_attribute_name='fraction'):
    """Assign each individual an id_name2 geography in proportion to the given fractions."""
    assert id_name1 in individual_dataset.get_known_attribute_names()
    if id_name2 not in individual_dataset.get_known_attribute_names():
        individual_dataset.add_primary_attribute(-1 * ones(individual_dataset.size()), id_name2)
    fraction_id1 = fraction_dataset.get_attribute(id_name1)
    individual_id1 = individual_dataset.get_attribute(id_name1)
    unique_ids = unique(fraction_id1)
    for id1 in unique_ids:
        individual_of_id1 = where(individual_id1 == id1)[0]
        n = individual_of_id1.size
        logger.log_status("Processing %s %s: %s individuals" % (id_name1, id1, n))
        if n > 0:
            fractions = fraction_dataset.get_attribute(fraction_attribute_name)[fraction_id1 == id1]
            id2 = fraction_dataset.get_attribute(id_name2)[fraction_id1 == id1]
            ## ignore individuals in a geography whose fractions sum to less than 1.0e-2
            if fractions.sum() < 1.0e-2:
                continue
            if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
                fractions = normalize(fractions)
            fractions_cumsum = ncumsum(fractions)
            R = random(n)
            index = searchsorted(fractions_cumsum, R)
            individual_dataset.modify_attribute(id_name2, id2[index], index=individual_of_id1)
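# Example (a made-up sketch, not from the model): the core of run() is
# assigning each individual a zone via searchsorted over cumulative fractions.
import numpy as np

# one blockgroup split across three zones; fractions are hypothetical
zone_ids = np.array([101, 102, 103])
fractions = np.array([0.5, 0.3, 0.2])   # should sum to ~1.0

n = 10                                  # individuals in this blockgroup
cum = np.cumsum(fractions)
R = np.random.random(n)
assigned = zone_ids[np.searchsorted(cum, R)]
# in expectation ~50% get zone 101, ~30% zone 102, ~20% zone 103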
def __init__(self, config):
    ss = SimulationState(new_instance=True)
    ss.set_current_time(config['base_year'])
    ss.set_cache_directory(config['cache_directory'])

    SessionConfiguration(new_instance=True,
                         package_order=config['dataset_pool_configuration'].package_order,
                         in_storage=AttributeCache())
    #if not os.path.exists(config['cache_directory']):  ## if cache exists, it will automatically skip
    cacher = CreateBaseyearCache()
    cache_dir = cacher.run(config)

    if 'estimation_database_configuration' in config:
        db_server = DatabaseServer(config['estimation_database_configuration'])
        db = db_server.get_database(config['estimation_database_configuration'].database_name)
        out_storage = StorageFactory().get_storage('sql_storage', storage_location=db)
    else:
        output_cache = os.path.join(config['cache_directory'], str(config['base_year'] + 1))
        out_storage = StorageFactory().get_storage('flt_storage', storage_location=output_cache)

    dataset_pool = SessionConfiguration().get_dataset_pool()
    households = dataset_pool.get_dataset("household")
    buildings = dataset_pool.get_dataset("building")
    zones = dataset_pool.get_dataset("zone")
    zone_ids = zones.get_id_attribute()
    capacity_attribute_name = "residential_units"  #_of_use_id_%s" % id
    capacity_variable_name = "%s=sanfrancisco.zone.aggregate_%s_from_building" % \
                             (capacity_attribute_name, capacity_attribute_name)
    buildings.compute_variables("sanfrancisco.building.zone_id", dataset_pool=dataset_pool)
    zones.compute_variables(capacity_variable_name, dataset_pool=dataset_pool)

    building_zone_id = buildings.get_attribute('zone_id')
    #is_household_unplaced = datasets['household'].get_attribute("building_id") <= 0
    is_household_unplaced = 1  # all households are unplaced
    household_building_id = zeros(households.size(), dtype='int32') - 1  #datasets['household'].get_attribute("building_id")
    for zone_id in zone_ids:
        capacity = zones.get_attribute_by_id(capacity_attribute_name, zone_id)
        is_household_in_this_zone = (households.get_attribute('zone_id') == zone_id)
        is_unplaced_household_in_this_zone = is_household_in_this_zone * is_household_unplaced
        is_building_in_this_zone = (building_zone_id == zone_id)
        #if not is_household_in_this_zone.sum() <= capacity:
        if capacity == 0 or is_household_in_this_zone.sum() == 0:
            print("WARNING: skipping zone %s (%s households, %s units)" %
                  (zone_id, is_household_in_this_zone.sum(), capacity))
            continue

        prob = buildings.get_attribute(capacity_attribute_name) * is_building_in_this_zone / array(capacity, dtype=float64)
        r = random(sum(is_unplaced_household_in_this_zone))
        prob_cumsum = ncumsum(prob)
        index_to_bldg = searchsorted(prob_cumsum, r)
        household_building_id[where(is_unplaced_household_in_this_zone)] = \
            buildings.get_attribute_by_index('building_id', index_to_bldg)

    households.set_values_of_one_attribute('building_id', household_building_id)
    households.write_dataset(out_table_name='households', out_storage=out_storage)
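# Example (hypothetical data): the per-zone placement above reduces to
# capacity-weighted sampling with replacement over the zone's buildings.
import numpy as np

building_ids = np.array([11, 12, 13])
units = np.array([10, 30, 60])               # residential_units per building

prob = units / units.sum(dtype=np.float64)   # weight ~ capacity share
cum = np.cumsum(prob)
r = np.random.random(5)                      # five unplaced households
assigned = building_ids[np.searchsorted(cum, r)]
# building 13 receives each household with probability ~0.6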
def sample_choice(prob_array, method="MC"):
    """Sample a chosen index for each row of prob_array.

    prob_array - 2-d array of choice probabilities, with one agent per row
                 and one alternative per column
    method - the method used to sample a choice, either "MC" (Monte Carlo)
             or "max_prob"
    """
    if prob_array.ndim != 2:
        raise RuntimeError("prob_array must be a 2d array")
    rows, columns = prob_array.shape
    if not ma.allclose(sum(prob_array, axis=1, dtype=float64), ones((rows,))):
        raise RuntimeError("prob_array must add up to 1 for each row")
    if method.lower() == "mc":
        cum_prob = ncumsum(prob_array, axis=1)
        R = uniform(0, 1, rows)  ## new R spec
        R.resize((rows, 1))
        #R = random((rows,1))  ## preOPUS4 specification of R - added 8 jul 09
        match = (R < cum_prob)
        choices = argmax(match, axis=1)  # the first index of True in each row
    elif method.lower() == "max_prob":
        choices = argmax(prob_array, axis=1)
    if choices.size != rows:
        raise RuntimeError("having problems sampling choices")
    return (arange(rows), choices)
def run(self, individual_dataset, fraction_dataset, id_name1='blockgroup_id', id_name2='zone_id',
        fraction_attribute_name='fraction', dataset_pool=None):
    """Assign each individual an id_name2 geography in proportion to the given fractions;
    dataset arguments may be passed as names to be looked up in dataset_pool."""
    if dataset_pool is None:
        dataset_pool = SessionConfiguration().get_dataset_pool()
    if isinstance(individual_dataset, str):
        individual_dataset = dataset_pool[individual_dataset]
    if isinstance(fraction_dataset, str):
        fraction_dataset = dataset_pool[fraction_dataset]

    assert id_name1 in individual_dataset.get_known_attribute_names()
    if id_name2 not in individual_dataset.get_known_attribute_names():
        dtype = fraction_dataset.get_attribute(id_name2).dtype
        default_values = -1 * ones(individual_dataset.size(), dtype=dtype)
        individual_dataset.add_primary_attribute(default_values, id_name2)
    fraction_id1 = fraction_dataset.get_attribute(id_name1)
    individual_id1 = individual_dataset.get_attribute(id_name1)
    unique_ids = unique(fraction_id1)
    for id1 in unique_ids:
        individual_of_id1 = where(individual_id1 == id1)[0]
        n = individual_of_id1.size
        logger.log_status("Processing %s %s: %s individuals" % (id_name1, id1, n))
        if n > 0:
            fractions = fraction_dataset.get_attribute(fraction_attribute_name)[fraction_id1 == id1]
            id2 = fraction_dataset.get_attribute(id_name2)[fraction_id1 == id1]
            ## ignore individuals in a geography whose fractions sum to less than 1.0e-2
            if fractions.sum() < 1.0e-2:
                continue
            if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
                fractions = normalize(fractions)
            fractions_cumsum = ncumsum(fractions)
            R = random(n)
            index = searchsorted(fractions_cumsum, R)
            individual_dataset.modify_attribute(id_name2, id2[index], index=individual_of_id1)
    individual_dataset.flush_dataset()
def probsample_replace(source_array, size, prob_array, return_index=False):
    """Unequal-probability sampling with replacement.

    Uses numpy's searchsorted, suitable for large arrays.
    """
    if not isinstance(source_array, ndarray):
        try:
            source_array = asarray(source_array)
        except:
            raise TypeError("source_array must be of type ndarray")
    if prob_array is None:
        return sample_replace(source_array, size, return_index=return_index)
    if prob_array.sum() == 0:
        raise ValueError("there are no non-zero weights in prob_array")
    cum_prob = ncumsum(prob_array)
    sample_prob = uniform(0, 1, size)
    sampled_index = searchsorted(cum_prob, sample_prob)
    if return_index:
        return sampled_index
    else:
        return source_array[sampled_index]
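# Example (weights made up): probsample_replace is the cumulative-sum /
# searchsorted form of weighted sampling with replacement.
import numpy as np

items = np.array(['a', 'b', 'c'])
weights = np.array([0.2, 0.3, 0.5])          # assumed to sum to 1 here

cum = np.cumsum(weights)
draws = np.searchsorted(cum, np.random.uniform(0, 1, 1000))
sample = items[draws]
# 'c' should appear roughly 500 times in 1000 draws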
def probsample_noreplace(source_array, sample_size, prob_array=None, exclude_index=None, return_index=False):
    """Generate non-repeating random 1d samples of sample_size from source_array,
    excluding indices that appear in exclude_index.
    Return indices into source_array if return_index is True.

    source_array - the source array to sample from
    sample_size - scalar representing the sample size
    prob_array - the array used to weight the sample
    exclude_index - array of indices into source_array that must not appear in the
                    result; can be used, for example, to exclude the current choice
                    from sampling
    """
    if not isinstance(source_array, ndarray):
        try:
            source_array = asarray(source_array)
        except:
            raise TypeError("source_array must be of type ndarray")
    pop_size = source_array.size
    if pop_size < sample_size:
        logger.log_warning("There are fewer indices (%s) in source_array than the sample_size (%s). Using probsample_replace." % (pop_size, sample_size))
        return probsample_replace(source_array, sample_size, prob_array=prob_array, return_index=return_index)
    elif pop_size == sample_size:
        if return_index:
            return arange(source_array.size)
        else:
            return source_array
    if sample_size <= 0:
        #logger.log_warning("sample_size is %s. Nothing is sampled." % sample_size)
        return array([], dtype='i')
    if prob_array is None:
        return sample_noreplace(source_array, sample_size, return_index=return_index)
    if not isinstance(prob_array, ndarray):
        try:
            prob_array = asarray(prob_array)
        except:
            raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array.astype(float64)  # creates a copy (not just a pointer)
    p_array_sum = p_array.sum()
    if not ma.allclose(p_array_sum, 1.0):
        p_array = p_array / p_array_sum
        if abs(1.0 - p_array_sum) > 0.01:  # warn only about a serious difference
            logger.log_warning("prob_array doesn't sum up to 1 and has been normalized. Sum: %s" % p_array_sum)
    prob_array_size = nonzerocounts(prob_array)
    if prob_array_size < sample_size:
        logger.log_warning("There are fewer non-zero weights (%s) in prob_array than the sample_size (%s). Using probsample_replace." % (prob_array_size, sample_size))
        return probsample_replace(source_array, sample_size, prob_array=p_array, return_index=return_index)
    elif prob_array_size == sample_size:
        if return_index:
            return where(prob_array > 0)[0]
        else:
            return source_array[prob_array > 0]

    totalmass = 1.0
    to_be_sampled = sample_size
    sampled_index = array([], dtype='i')  # initialize sampled_index
    if exclude_index is not None:
        try:
            totalmass -= asarray(p_array[exclude_index]).sum()
            p_array[exclude_index] = 0
        except IndexError:
            logger.log_warning("The exclude_index (%s) is not in prob array" % (exclude_index))
    cum_prob = ncumsum(p_array / p_array.sum()) * totalmass
    if not ma.allclose(cum_prob[-1], totalmass):
        raise ValueError("prob_array doesn't sum up to 1 even after normalization")
    while True:
        sample_prob = uniform(0, totalmass, to_be_sampled).astype(float32)
        proposed_index = searchsorted(cum_prob, sample_prob)
        i = 0
        if numpy.__version__ >= '1.2.0':
            ## numpy.unique1d reversed its return order in 1.2.0; take [1] instead of [0]
            i = 1
        uniqueidx = unique1d(proposed_index, True)[i]
        valid_index = proposed_index[sort(uniqueidx)]
        sampled_index = concatenate((sampled_index, valid_index))
        if valid_index.size == to_be_sampled:
            if return_index:
                return sampled_index
            else:
                return source_array[sampled_index]
        totalmass -= asarray(p_array[valid_index]).sum()
        p_array[valid_index] = 0
        cum_prob = ncumsum(p_array / p_array.sum()) * totalmass
        assert ma.allclose(totalmass, cum_prob[-1])
        to_be_sampled -= valid_index.size
def prob2dsample(source_array, sample_size, prob_array=None, exclude_index=None, replace=False, return_index=False):
    """Generate non-repeating random 2d samples of sample_size from source_array,
    excluding indices that appear in exclude_index; samples column by column, which is
    more efficient when sample_size has more rows than columns.
    Return elements of source_array with shape sample_size.

    source_array - the source array to sample from
    sample_size - tuple representing the sample size as (rows, columns); no repeats within a row
    exclude_index - array of indices that must not appear in the result, used to
                    exclude the current choice from sampling
    prob_array - the array used to weight the sample
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    if source_array_size <= columns:
        logger.log_warning("source_array has no more indices (%s) than the sample_size columns (%s). Sampling all %s." %
                           (source_array_size, columns, source_array_size))
        return ones((rows, 1)) * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)
    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array
    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:  # warn only about a serious difference
            logger.log_warning("prob_array doesn't sum up to 1 and has been normalized. Sum: %s" % p_array_sum)
        p_array = p_array / p_array_sum

    cum_prob = ncumsum(p_array)
    sampled_choiceset_index = zeros(sample_size, dtype="int32") - 1  # initialize output

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if rank(exclude_index) == 1:
                exclude_index = exclude_index[:, newaxis]
        else:
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32")
        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                proposed_index = probsample_replace(arange(source_array_size), slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled, ]
                except:
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(proposed_index, exclude_array)
                valid_index = slots_to_be_sampled[duplicate_indicator == 0]
                sampled_choiceset_index[valid_index, j] = proposed_index[duplicate_indicator == 0]
                if nonzerocounts(duplicate_indicator) == 0:
                    break
                slots_to_be_sampled = slots_to_be_sampled[duplicate_indicator > 0]
            exclude_index = concatenate((exclude_index, take(sampled_choiceset_index, (j,), axis=1)), axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(arange(source_array_size), rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]
def prob2dsample(source_array, sample_size, prob_array=None, exclude_index=None, replace=False, return_index=False):
    """Generate non-repeating random 2d samples of sample_size from source_array,
    excluding indices that appear in exclude_index; samples column by column, which is
    more efficient when sample_size has more rows than columns.
    Return elements of source_array with shape sample_size.

    source_array - the source array to sample from
    sample_size - tuple representing the sample size as (rows, columns); no repeats within a row
    exclude_index - array of indices that must not appear in the result, used to
                    exclude the current choice from sampling
    prob_array - the array used to weight the sample
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    if source_array_size <= columns and not replace:
        logger.log_warning("source_array has no more indices (%s) than the sample_size columns (%s). Sampling all %s." %
                           (source_array_size, columns, source_array_size))
        if return_index:
            return ones((rows, 1), dtype='i') * arange(source_array_size)[newaxis, :]
        else:
            return ones((rows, 1), dtype='i') * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)
    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array
    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:  # warn only about a serious difference
            logger.log_warning("prob_array doesn't sum up to 1 and has been normalized. Sum: %s" % p_array_sum)
        p_array = p_array / p_array_sum

    cum_prob = ncumsum(p_array)
    sampled_choiceset_index = zeros(sample_size, dtype="int32") - 1  # initialize output

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if rank(exclude_index) == 1:
                exclude_index = exclude_index[:, newaxis]
        else:
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32")
        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                proposed_index = probsample_replace(arange(source_array_size), slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled, ]
                except:
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(proposed_index, exclude_array)
                valid_index = slots_to_be_sampled[duplicate_indicator == 0]
                sampled_choiceset_index[valid_index, j] = proposed_index[duplicate_indicator == 0]
                if nonzerocounts(duplicate_indicator) == 0:
                    break
                slots_to_be_sampled = slots_to_be_sampled[duplicate_indicator > 0]
            exclude_index = concatenate((exclude_index, take(sampled_choiceset_index, (j,), axis=1)), axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(arange(source_array_size), rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]