def _update_household_set(self, household_set):
    """Updates also the person set."""
    hh_ids_to_copy = household_set.get_id_attribute()[self.mapping_existing_hhs_to_new_hhs]
    npersons_in_hhs = household_set.get_attribute_by_index('persons', self.mapping_existing_hhs_to_new_hhs)
    result = USHouseholdTransitionModel._update_household_set(self, household_set)
    new_hh_ids = household_set.get_id_attribute()[(household_set.size() - self.mapping_existing_hhs_to_new_hhs.size):household_set.size()]
    # remove persons that belong to non-existing households
    eliminate_index = where(logical_not(ismember(
                          self.person_set.get_attribute(household_set.get_id_name()[0]),
                          household_set.get_id_attribute())))[0]
    self.person_set.remove_elements(eliminate_index)
    # duplicate persons
    unique_persons_to_duplicate = ismember(self.person_set.get_attribute(household_set.get_id_name()[0]),
                                           hh_ids_to_copy)
    person_considered_idx = where(unique_persons_to_duplicate)[0]
    npersons_in_hhs_sum = npersons_in_hhs.sum()
    persons_to_duplicate = -1 * ones(npersons_in_hhs_sum, dtype=int32)
    new_person_hh_ids = zeros(npersons_in_hhs_sum, dtype=int32)
    considered_person_hh_ids = self.person_set.get_attribute(household_set.get_id_name()[0])[person_considered_idx]
    j = 0
    for i in arange(hh_ids_to_copy.size):
        idx = where(considered_person_hh_ids == hh_ids_to_copy[i])[0]
        if idx.size == npersons_in_hhs[i]:
            persons_to_duplicate[j:(j + npersons_in_hhs[i])] = person_considered_idx[idx]
            new_person_hh_ids[j:(j + npersons_in_hhs[i])] = new_hh_ids[i]
            j += npersons_in_hhs[i]
    if hh_ids_to_copy.size > 0 and j < npersons_in_hhs_sum:
        persons_to_duplicate = persons_to_duplicate[0:j]
        new_person_hh_ids = new_person_hh_ids[0:j]
    if persons_to_duplicate.size <= 0:
        return result
    new_persons_idx = self.person_set.duplicate_rows(persons_to_duplicate)
    # assign job_id to 'no job', but only if job_id is present in the person dataset
    dtype = self.person_set.get_data_type('job_id')
    if dtype is not None:
        self.person_set.modify_attribute(name='job_id',
                                         data=zeros(new_persons_idx.size, dtype=dtype),
                                         index=new_persons_idx)
    # assign the right household_id
    self.person_set.modify_attribute(name=household_set.get_id_name()[0],
                                     data=new_person_hh_ids, index=new_persons_idx)
    self.debug.print_debug("Created %s persons." % new_persons_idx.size, 3)
    # check that the number of persons in the household set matches the person set
    if household_set.get_attribute('persons').sum() != self.person_set.size():
        logger.log_warning('Number of persons in household set (%s) does not correspond to those in person set (%s).' % (
            household_set.get_attribute('persons').sum(), self.person_set.size()))
    return result
def _update_household_set(self, household_set):
    """Updates also the person set."""
    hh_ids_to_copy = household_set.get_id_attribute()[self.mapping_existing_hhs_to_new_hhs]
    npersons_in_hhs = household_set.get_attribute_by_index('persons', self.mapping_existing_hhs_to_new_hhs)
    result = USHouseholdTransitionModel._update_household_set(self, household_set)
    new_hh_ids = household_set.get_id_attribute()[(household_set.size() - self.mapping_existing_hhs_to_new_hhs.size):household_set.size()]
    # remove persons that belong to non-existing households
    eliminate_index = where(logical_not(ismember(
                          self.person_set.get_attribute(household_set.get_id_name()[0]),
                          household_set.get_id_attribute())))[0]
    self.person_set.remove_elements(eliminate_index)
    # duplicate persons
    unique_persons_to_duplicate = ismember(self.person_set.get_attribute(household_set.get_id_name()[0]),
                                           hh_ids_to_copy)
    person_considered_idx = where(unique_persons_to_duplicate)[0]
    npersons_in_hhs_sum = npersons_in_hhs.sum()
    persons_to_duplicate = -1 * ones(npersons_in_hhs_sum, dtype=int32)
    new_person_hh_ids = zeros(npersons_in_hhs_sum, dtype=int32)
    considered_person_hh_ids = self.person_set.get_attribute(household_set.get_id_name()[0])[person_considered_idx]
    j = 0
    for i in arange(hh_ids_to_copy.size):
        idx = where(considered_person_hh_ids == hh_ids_to_copy[i])[0]
        if idx.size == npersons_in_hhs[i]:
            persons_to_duplicate[j:(j + npersons_in_hhs[i])] = person_considered_idx[idx]
            new_person_hh_ids[j:(j + npersons_in_hhs[i])] = new_hh_ids[i]
            j += npersons_in_hhs[i]
    if hh_ids_to_copy.size > 0 and j < npersons_in_hhs_sum:
        persons_to_duplicate = persons_to_duplicate[0:j]
        new_person_hh_ids = new_person_hh_ids[0:j]
    if persons_to_duplicate.size <= 0:
        return result
    new_persons_idx = self.person_set.duplicate_rows(persons_to_duplicate)
    # assign job_id to 'no job'
    self.person_set.modify_attribute(name='job_id',
                                     data=zeros(new_persons_idx.size,
                                                dtype=self.person_set.get_data_type('job_id')),
                                     index=new_persons_idx)
    # assign the right household_id
    self.person_set.modify_attribute(name=household_set.get_id_name()[0],
                                     data=new_person_hh_ids, index=new_persons_idx)
    self.debug.print_debug("Created %s persons." % new_persons_idx.size, 3)
    # check that the number of persons in the household set matches the person set
    if household_set.get_attribute('persons').sum() != self.person_set.size():
        logger.log_warning('Number of persons in household set (%s) does not correspond to those in person set (%s).' % (
            household_set.get_attribute('persons').sum(), self.person_set.size()))
    return result
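A minimal standalone sketch of the duplication bookkeeping above, using plain numpy arrays in place of the Opus person and household sets (all ids are invented):

import numpy as np

# hypothetical data standing in for the Opus datasets
person_hh_ids = np.array([1, 1, 2, 3, 3, 3])   # household_id of each person
hh_ids_to_copy = np.array([1, 3])              # existing households being cloned
new_hh_ids = np.array([10, 11])                # ids assigned to the clones

# collect, per cloned household, the person rows to duplicate
# and the household_id the duplicates should receive
persons_to_duplicate = []
new_person_hh_ids = []
for old_id, new_id in zip(hh_ids_to_copy, new_hh_ids):
    idx = np.where(person_hh_ids == old_id)[0]
    persons_to_duplicate.extend(idx)
    new_person_hh_ids.extend([new_id] * idx.size)

print(persons_to_duplicate)   # [0, 1, 3, 4, 5]
print(new_person_hh_ids)      # [10, 10, 11, 11, 11]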
def prepare_for_run(self, agent_set=None, agent_filter=None, agents_index=None,
                    convert_index_to_person=False, filter_threshold=0, **kwargs):
    """Combine agent_filter and agents_index. If convert_index_to_person is True,
    agents_index is an index of households whereas agent_set is a person dataset;
    thus, a conversion needs to be done."""
    spec, coef, index = ChoiceModel.prepare_for_run(self, agent_set=agent_set,
                                                    agent_filter=agent_filter,
                                                    filter_threshold=filter_threshold,
                                                    **kwargs)
    if agents_index is not None:
        if convert_index_to_person:
            hhs = self.dataset_pool.get_dataset('household')
            agents_index = where(ismember(agent_set['%s' % hhs.get_id_name()[0]],
                                          hhs.get_id_attribute()[agents_index]))[0]
        tmp1 = zeros(agent_set.size(), dtype='bool8')
        tmp1[agents_index] = True
        if index is not None:
            tmp2 = zeros(agent_set.size(), dtype='bool8')
            tmp2[index] = True
            tmp1 = logical_and(tmp1, tmp2)
        index = where(tmp1)[0]
    return (spec, coef, index)
def run(self, dataset_pool):
    workers = dataset_pool['person']
    faz_ids = workers.compute_variables(
        'faz_id = person.disaggregate(zone.faz_id, intermediates=[parcel, building, household])',
        dataset_pool=dataset_pool)
    is_worker = workers.compute_variables('urbansim_parcel.person.is_worker',
                                          dataset_pool=dataset_pool)
    workers_jobs = workers['job_id']
    job_ids = arange(self.job_id_range[0], self.job_id_range[1] + 1)
    for area, values in self.faz_worker_mapping.iteritems():
        fazes = array(values[0])
        amount = values[1]
        indicator = logical_and(ismember(faz_ids, fazes), is_worker)
        job_idx = where(job_ids > 0)[0]  # jobs not yet assigned
        sampled_jobs = sample_noreplace(job_idx, amount)
        workers_idx = where(indicator > 0)[0]
        sampled_workers = sample_noreplace(workers_idx, amount)
        workers_jobs[sampled_workers] = job_ids[sampled_jobs]
        job_ids[sampled_jobs] = 0  # mark these jobs as taken
    workers.modify_attribute(name='job_id', data=workers_jobs)
def run(self, specification, coefficients, agent_set, agents_index=None,
        sync_persons=False, **kwargs):
    """Set sync_persons to True if the model is run on the household level and
    the persons table should be synchronized."""
    results = ChoiceModel.run(self, specification, coefficients, agent_set,
                              agents_index=agents_index, **kwargs)
    if sync_persons:
        persons = self.dataset_pool.get_dataset('person')
        choice_id_name = self.choice_set.get_id_name()[0]
        values = persons.compute_variables(
            ['_tmp_ = person.disaggregate(%s.%s)' % (agent_set.get_dataset_name(), choice_id_name)],
            dataset_pool=self.dataset_pool)
        if agents_index is None:
            agents_index = arange(agent_set.size())
        pers_idx = where(ismember(persons['%s' % agent_set.get_id_name()[0]],
                                  agent_set.get_id_attribute()[agents_index]))[0]
        if choice_id_name not in persons.get_known_attribute_names():
            persons.add_primary_attribute(data=zeros(persons.size(), dtype=values.dtype),
                                          name=choice_id_name)
        # write only the persons belonging to the modified agents
        persons.modify_attribute(data=values[pers_idx], name=choice_id_name, index=pers_idx)
        persons.delete_one_attribute('_tmp_')
    agent_set.modify_attribute(data=results, name=self.choice_attribute_name.get_alias(),
                               index=agents_index)
    return results
def run(self, dataset, secondary_dataset, index=None, attribute_to_be_modified=None,
        value=0, filter=None, dataset_pool=None):
    """'dataset' must contain an attribute of the same name as the id attribute of the
    secondary_dataset (join_attribute). The model finds members of 'dataset' for which
    the values of the join_attribute correspond to values of that attribute of the
    secondary_dataset (possibly restricted by 'index' and/or 'filter', which is an
    expression). For all those members, the attribute 'attribute_to_be_modified' is
    changed to 'value'. If 'attribute_to_be_modified' is not given, the join_attribute
    is modified.
    """
    if index is None:
        index = arange(secondary_dataset.size())
    if filter is not None:
        members_to_consider = zeros(secondary_dataset.size(), dtype='bool8')
        members_to_consider[index] = True
        pass_filter = secondary_dataset.compute_variables([filter],
                                                          dataset_pool=dataset_pool) > 0
        members_to_consider = logical_and(members_to_consider, pass_filter)
        index = where(members_to_consider)[0]
    ids = secondary_dataset.get_id_attribute()[index]
    join_attribute = secondary_dataset.get_id_name()[0]
    members_idx = where(ismember(dataset.get_attribute(join_attribute), ids))[0]
    if attribute_to_be_modified is None:
        attribute_to_be_modified = join_attribute
    # the data array must match the size of the index being modified
    dataset.modify_attribute(name=attribute_to_be_modified,
                             data=array(members_idx.size * [value]),
                             index=members_idx)
    logger.log_status("%s values of %s.%s are set to %s." % (
        members_idx.size, dataset.get_dataset_name(), attribute_to_be_modified, value))
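The join itself reduces to an ismember test on the id attribute. A standalone numpy equivalent, with invented arrays standing in for the two datasets (numpy.in1d plays the role of ismember):

import numpy as np

building_parcel_ids = np.array([5, 7, 7, 9, 12])   # the dataset's join attribute
demolished_parcels = np.array([7, 12])             # ids from the secondary dataset

members_idx = np.where(np.in1d(building_parcel_ids, demolished_parcels))[0]
building_parcel_ids[members_idx] = 0               # 'value' applied to matched members
print(members_idx)            # [1 2 4]
print(building_parcel_ids)    # [5 0 0 9 0]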
def run(self, realestate_dataset,
        year=None,
        occupied_spaces_variable="occupied_units",
        total_spaces_variable="total_units",
        target_attribute_name='target_vacancy_rate',
        sample_from_dataset=None,
        sample_filter="",
        reset_attribute_value={},
        year_built='year_built',
        dataset_pool=None,
        append_to_realestate_dataset=False,
        table_name="development_projects",
        dataset_name="development_project",
        id_name='development_project_id',
        **kwargs):
    """ sample_filter attribute/variable indicates which records in the dataset
        are eligible in the sampling for removal or cloning
        append_to_realestate_dataset - whether to append the new dataset to realestate_dataset
    """
    if self.target_vancy_dataset is None:
        raise RuntimeError, "target_vacancy_rate dataset is unspecified."

    if not sample_from_dataset:
        sample_from_dataset = realestate_dataset

    #if dataset_pool is None:
    #    dataset_pool = SessionConfiguration().get_dataset_pool()
    if year is None:
        year = SimulationState().get_current_time()
    this_year_index = where(self.target_vancy_dataset.get_attribute('year') == year)[0]
    target_vacancy_for_this_year = DatasetSubset(self.target_vancy_dataset, this_year_index)

    column_names = list(set(self.target_vancy_dataset.get_known_attribute_names())
                        - set([target_attribute_name, occupied_spaces_variable,
                               total_spaces_variable, 'year', '_hidden_id_']))
    column_names.sort(reverse=True)
    column_values = dict([(name, target_vacancy_for_this_year.get_attribute(name))
                          for name in column_names + [target_attribute_name]])

    independent_variables = list(set([re.sub('_max$', '', re.sub('_min$', '', col))
                                      for col in column_names]))
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()
    sample_dataset_known_attributes = sample_from_dataset.get_known_attribute_names()
    for variable in independent_variables:
        if variable not in dataset_known_attributes:
            realestate_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
        if variable not in sample_dataset_known_attributes:
            sample_from_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()  # update after compute

    if sample_filter:
        short_name = VariableName(sample_filter).get_alias()
        if short_name not in dataset_known_attributes:
            filter_indicator = sample_from_dataset.compute_variables(sample_filter, dataset_pool=dataset_pool)
        else:
            filter_indicator = sample_from_dataset.get_attribute(short_name)
    else:
        filter_indicator = 1

    sampled_index = array([], dtype=int32)

    # log header
    if PrettyTable is not None:
        status_log = PrettyTable()
        status_log.set_field_names(column_names + ["actual", "target", "expected", "difference", "action"])
    else:
        logger.log_status("\t".join(column_names + ["actual", "target", "expected", "difference", "action"]))
    error_log = ''
    for index in range(target_vacancy_for_this_year.size()):
        this_sampled_index = array([], dtype=int32)
        indicator = ones(realestate_dataset.size(), dtype='bool')
        sample_indicator = ones(sample_from_dataset.size(), dtype='bool')
        criterion = {}  # for logging
        for attribute in independent_variables:
            if attribute in dataset_known_attributes:
                dataset_attribute = realestate_dataset.get_attribute(attribute)
                sample_attribute = sample_from_dataset.get_attribute(attribute)
            else:
                raise ValueError, "attribute %s used in target vacancy dataset can not be found in dataset %s" % (
                    attribute, realestate_dataset.get_dataset_name())

            if attribute + '_min' in column_names:
                amin = target_vacancy_for_this_year.get_attribute(attribute + '_min')[index]
                criterion.update({attribute + '_min': amin})
                if amin != -1:
                    indicator *= dataset_attribute >= amin
                    sample_indicator *= sample_attribute >= amin
            if attribute + '_max' in column_names:
                amax = target_vacancy_for_this_year.get_attribute(attribute + '_max')[index]
                criterion.update({attribute + '_max': amax})
                if amax != -1:
                    indicator *= dataset_attribute <= amax
                    sample_indicator *= sample_attribute <= amax
            if attribute in column_names:
                aval = column_values[attribute][index]
                criterion.update({attribute: aval})
                if aval == -1:
                    continue
                elif aval == -2:
                    ## treat -2 in control totals column as complement set,
                    ## i.e. all other values not already specified in this column
                    indicator *= logical_not(ismember(dataset_attribute, column_values[attribute]))
                    sample_indicator *= logical_not(ismember(sample_attribute, column_values[attribute]))
                else:
                    indicator *= dataset_attribute == aval
                    sample_indicator *= sample_attribute == aval

        this_total_spaces_variable, this_occupied_spaces_variable = total_spaces_variable, occupied_spaces_variable
        ## total/occupied_spaces_variable can be specified either as a universal name for all realestate
        ## or in the target_vacancy_rate dataset for each vacancy category
        if occupied_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_occupied_spaces_variable = target_vacancy_for_this_year.get_attribute(occupied_spaces_variable)[index]
        if total_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_total_spaces_variable = target_vacancy_for_this_year.get_attribute(total_spaces_variable)[index]

        # 'col' is the last entry of column_names, leaked from the list comprehension
        # above (Python 2 scoping); criterion[col] identifies the vacancy category
        this_total_spaces_variable += '_' + str(criterion[col])
        this_occupied_spaces_variable += '_' + str(criterion[col])

        logger.be_quiet()  # temporarily disable logging
        realestate_dataset.compute_one_variable_with_unknown_package(this_occupied_spaces_variable, dataset_pool=dataset_pool)
        realestate_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        sample_from_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        logger.talk()

        actual_num = (realestate_dataset.get_attribute(this_total_spaces_variable)).sum()
        # target_num is obsolete with this version.
        target_num = int(round((realestate_dataset.get_attribute(this_occupied_spaces_variable)).sum() /
                               (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        """If the target vacancy is very small and the inflow to the region big,
        it is not enough to check only the current simulation year's vacancy.
        The simulation is more robust if the BTM is anticipating the next year's
        population (of households and jobs).
        #TODO: Make code more general to cover various stratifications in the real estate market.
        """
        if criterion[col] == 1:
            idx = where(self.control_totals.get_attribute("year") == year + 1)[0]
            this_years_control_totals = DatasetSubset(self.control_totals, idx)
            expected_num = int(round(this_years_control_totals.get_attribute('total_number_of_households').sum() /
                                     (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        if criterion[col] == 0:
            idx = where(self.employment_control_totals.get_attribute("year") == year + 1)[0]
            next_years_control_totals = DatasetSubset(self.employment_control_totals, idx)
            expected_num = int(round(next_years_control_totals.get_attribute('number_of_jobs').sum() /
                                     (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))

        diff = expected_num - actual_num
        # Previous version, which was checking the current year's occupation:
        #diff = target_num - actual_num
        if diff > 0:
            total_spaces_in_sample_dataset = sample_from_dataset.get_attribute(this_total_spaces_variable)
            legit_index = where(logical_and(sample_indicator, filter_indicator) * total_spaces_in_sample_dataset > 0)[0]
            if legit_index.size > 0:
                mean_size = total_spaces_in_sample_dataset[legit_index].mean()
                num_of_projects_to_sample = int(diff / mean_size)
                ## sample at least 1 project when diff > 0; otherwise it is an
                ## endless loop when num_of_projects_to_sample == 0
                num_of_projects_to_sample = num_of_projects_to_sample if num_of_projects_to_sample > 0 else 1
                while total_spaces_in_sample_dataset[this_sampled_index].sum() < diff:
                    lucky_index = sample_replace(legit_index, num_of_projects_to_sample)
                    this_sampled_index = concatenate((this_sampled_index, lucky_index))
                this_sampled_index = this_sampled_index[0:(1 + searchsorted(cumsum(total_spaces_in_sample_dataset[this_sampled_index]), diff))]
                sampled_index = concatenate((sampled_index, this_sampled_index))
            else:
                error_log += "There is nothing to sample from %s and no new development will happen for " % sample_from_dataset.get_dataset_name() + \
                             ','.join([col + "=" + str(criterion[col]) for col in column_names]) + '\n'
        #if diff < 0: #TODO demolition; not yet supported

        ## log status
        action = "0"
        if this_sampled_index.size > 0:
            action_num = total_spaces_in_sample_dataset[this_sampled_index].sum()
            if diff > 0:
                action = "+" + str(action_num)
            if diff < 0:
                action = "-" + str(action_num)
        cat = [str(criterion[col]) for col in column_names]
        cat += [str(actual_num), str(target_num), str(expected_num), str(diff), action]
        if PrettyTable is not None:
            status_log.add_row(cat)
        else:
            logger.log_status("\t".join(cat))

    if PrettyTable is not None:
        logger.log_status("\n" + status_log.get_string())
    if error_log:
        logger.log_error(error_log)

    result_data = {}
    result_dataset = None
    index = array([], dtype='int32')
    if True:  # sampled_index.size > 0:
        ### ideally duplicate_rows() is all that is needed to add newly cloned rows;
        ### to be more cautious, copy the data to be cloned, remove elements, then append the cloned data
        ##realestate_dataset.duplicate_rows(sampled_index)
        result_data.setdefault(year_built, resize(year, sampled_index.size).astype('int32'))
        ## also add 'independent_variables' to the new dataset
        for attribute in set(sample_from_dataset.get_primary_attribute_names() + independent_variables):
            if reset_attribute_value.has_key(attribute):
                result_data[attribute] = resize(array(reset_attribute_value[attribute]), sampled_index.size)
            else:
                result_data[attribute] = sample_from_dataset.get_attribute_by_index(attribute, sampled_index)
        if id_name and result_data and id_name not in result_data:
            result_data[id_name] = arange(sampled_index.size, dtype='int32') + 1

        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name=table_name, table_data=result_data)

        result_dataset = Dataset(id_name=id_name,
                                 in_storage=storage,
                                 in_table_name=table_name,
                                 dataset_name=dataset_name)
        index = arange(result_dataset.size())

    if append_to_realestate_dataset:
        if len(result_data) > 0:
            index = realestate_dataset.add_elements(result_data, require_all_attributes=False,
                                                    change_ids_if_not_unique=True)
        result_dataset = realestate_dataset

    return (result_dataset, index)
def run(self, realestate_dataset,
        year=None,
        occupied_spaces_variable="occupied_units",
        total_spaces_variable="total_units",
        target_attribute_name='target_vacancy_rate',
        sample_from_dataset=None,
        sample_filter="",
        reset_attribute_value={},
        year_built='year_built',
        dataset_pool=None,
        append_to_realestate_dataset=False,
        table_name="development_projects",
        dataset_name="development_project",
        id_name='development_project_id',
        **kwargs):
    """ sample_filter attribute/variable indicates which records in the dataset
        are eligible in the sampling for removal or cloning
        append_to_realestate_dataset - whether to append the new dataset to realestate_dataset
    """
    if self.target_vancy_dataset is None:
        raise RuntimeError, "target_vacancy_rate dataset is unspecified."

    if not sample_from_dataset:
        sample_from_dataset = realestate_dataset

    #if dataset_pool is None:
    #    dataset_pool = SessionConfiguration().get_dataset_pool()
    alldata = dataset_pool.get_dataset('alldata')
    unit_names = dataset_pool.get_dataset('building_type').get_attribute('unit_name')
    sqft_per_job = dataset_pool.get_dataset('building_sqft_per_job')
    zones = realestate_dataset.compute_variables("building.disaggregate(parcel.zone_id)")
    type_ids = realestate_dataset.get_attribute("building_type_id")
    building_sqft_per_job_table = sqft_per_job.get_building_sqft_as_table(zones.max(), type_ids.max())
    if year is None:
        year = SimulationState().get_current_time()
    this_year_index = where(self.target_vancy_dataset.get_attribute('year') == year)[0]
    target_vacancy_for_this_year = DatasetSubset(self.target_vancy_dataset, this_year_index)

    column_names = list(set(self.target_vancy_dataset.get_known_attribute_names())
                        - set([target_attribute_name, occupied_spaces_variable,
                               total_spaces_variable, 'year', '_hidden_id_']))
    column_names.sort(reverse=True)
    column_values = dict([(name, target_vacancy_for_this_year.get_attribute(name))
                          for name in column_names + [target_attribute_name]])

    independent_variables = list(set([re.sub('_max$', '', re.sub('_min$', '', col))
                                      for col in column_names]))
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()
    sample_dataset_known_attributes = sample_from_dataset.get_known_attribute_names()
    for variable in independent_variables:
        if variable not in dataset_known_attributes:
            realestate_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
        if variable not in sample_dataset_known_attributes:
            sample_from_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()  # update after compute

    if sample_filter:
        short_name = VariableName(sample_filter).get_alias()
        if short_name not in dataset_known_attributes:
            filter_indicator = sample_from_dataset.compute_variables(sample_filter, dataset_pool=dataset_pool)
        else:
            filter_indicator = sample_from_dataset.get_attribute(short_name)
    else:
        filter_indicator = 1

    sampled_index = array([], dtype=int32)

    # log header
    if PrettyTable is not None:
        status_log = PrettyTable()
        status_log.set_field_names(column_names + ["actual", "target", "difference", "action"])
    else:
        logger.log_status("\t".join(column_names + ["actual", "target", "difference", "action"]))
    error_log = ''
    for index in range(target_vacancy_for_this_year.size()):
        this_sampled_index = array([], dtype=int32)
        indicator = ones(realestate_dataset.size(), dtype='bool')
        sample_indicator = ones(sample_from_dataset.size(), dtype='bool')
        criterion = {}  # for logging
        for attribute in independent_variables:
            if attribute in dataset_known_attributes:
                dataset_attribute = realestate_dataset.get_attribute(attribute)
                sample_attribute = sample_from_dataset.get_attribute(attribute)
            else:
                raise ValueError, "attribute %s used in target vacancy dataset can not be found in dataset %s" % (
                    attribute, realestate_dataset.get_dataset_name())

            if attribute + '_min' in column_names:
                amin = target_vacancy_for_this_year.get_attribute(attribute + '_min')[index]
                criterion.update({attribute + '_min': amin})
                if amin != -1:
                    indicator *= dataset_attribute >= amin
                    sample_indicator *= sample_attribute >= amin
            if attribute + '_max' in column_names:
                amax = target_vacancy_for_this_year.get_attribute(attribute + '_max')[index]
                criterion.update({attribute + '_max': amax})
                if amax != -1:
                    indicator *= dataset_attribute <= amax
                    sample_indicator *= sample_attribute <= amax
            if attribute in column_names:
                aval = column_values[attribute][index]
                criterion.update({attribute: aval})
                if aval == -1:
                    continue
                elif aval == -2:
                    ## treat -2 in control totals column as complement set,
                    ## i.e. all other values not already specified in this column
                    indicator *= logical_not(ismember(dataset_attribute, column_values[attribute]))
                    sample_indicator *= logical_not(ismember(sample_attribute, column_values[attribute]))
                else:
                    indicator *= dataset_attribute == aval
                    sample_indicator *= sample_attribute == aval

        this_total_spaces_variable, this_occupied_spaces_variable = total_spaces_variable, occupied_spaces_variable
        ## total/occupied_spaces_variable can be specified either as a universal name for all realestate
        ## or in the target_vacancy_rate dataset for each vacancy category
        if occupied_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_occupied_spaces_variable = target_vacancy_for_this_year.get_attribute(occupied_spaces_variable)[index]
        if total_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_total_spaces_variable = target_vacancy_for_this_year.get_attribute(total_spaces_variable)[index]

        logger.be_quiet()  # temporarily disable logging
        realestate_dataset.compute_one_variable_with_unknown_package(this_occupied_spaces_variable, dataset_pool=dataset_pool)
        realestate_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        sample_from_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)

        # this block assumes the rows of the target vacancy table are ordered by
        # building_type_id, i.e. row 'index' corresponds to building type index+1
        if unit_names[index] == "residential_units":
            num_units = alldata.compute_variables("alldata.aggregate_all(household.building_type_id==%s)" % (index + 1))
            #persons = household_set.compute_variables("%s.number_of_agents(%s)" % (hh_ds_name, person_ds_name), resources=resources)
            num_units = num_units[0]
        else:
            num_units = alldata.compute_variables("alldata.aggregate_all(job.disaggregate(employment_submarket.building_type_id)==%s)" % (index + 1))
            num_units = num_units * building_sqft_per_job_table[1, (index + 1)]
            num_units = num_units[0]
            # need to make sure that the job employment_submarket doesn't rely on building...
            # Must do non-home-based jobs only and then multiply by building_sqft
        logger.talk()

        actual_num = (indicator * realestate_dataset.get_attribute(this_total_spaces_variable)).sum()
        #target_num = int(round( (indicator * realestate_dataset.get_attribute(this_occupied_spaces_variable)).sum() /\
        target_num = int(round(num_units /
                               (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        diff = target_num - actual_num
        if diff > 0:
            total_spaces_in_sample_dataset = sample_from_dataset.get_attribute(this_total_spaces_variable)
            legit_index = where(logical_and(sample_indicator, filter_indicator) * total_spaces_in_sample_dataset > 0)[0]
            if legit_index.size > 0:
                mean_size = total_spaces_in_sample_dataset[legit_index].mean()
                num_of_projects_to_sample = int(diff / mean_size)
                ## sample at least 1 project when diff > 0; otherwise it is an
                ## endless loop when num_of_projects_to_sample == 0
                num_of_projects_to_sample = num_of_projects_to_sample if num_of_projects_to_sample > 0 else 1
                while total_spaces_in_sample_dataset[this_sampled_index].sum() < diff:
                    lucky_index = sample_replace(legit_index, num_of_projects_to_sample)
                    this_sampled_index = concatenate((this_sampled_index, lucky_index))
                this_sampled_index = this_sampled_index[0:(1 + searchsorted(cumsum(total_spaces_in_sample_dataset[this_sampled_index]), diff))]
                sampled_index = concatenate((sampled_index, this_sampled_index))
            else:
                error_log += "There is nothing to sample from %s and no new development will happen for " % sample_from_dataset.get_dataset_name() + \
                             ','.join([col + "=" + str(criterion[col]) for col in column_names]) + '\n'
        #if diff < 0: #TODO demolition; not yet supported

        ## log status
        action = "0"
        if this_sampled_index.size > 0:
            action_num = total_spaces_in_sample_dataset[this_sampled_index].sum()
            if diff > 0:
                action = "+" + str(action_num)
            if diff < 0:
                action = "-" + str(action_num)
        cat = [str(criterion[col]) for col in column_names]
        cat += [str(actual_num), str(target_num), str(diff), action]
        if PrettyTable is not None:
            status_log.add_row(cat)
        else:
            logger.log_status("\t".join(cat))

    if PrettyTable is not None:
        logger.log_status("\n" + status_log.get_string())
    if error_log:
        logger.log_error(error_log)

    result_data = {}
    result_dataset = None
    index = array([], dtype='int32')
    if sampled_index.size > 0:
        ### ideally duplicate_rows() is all that is needed to add newly cloned rows;
        ### to be more cautious, copy the data to be cloned, remove elements, then append the cloned data
        ##realestate_dataset.duplicate_rows(sampled_index)
        result_data.setdefault(year_built, resize(year, sampled_index.size).astype('int32'))
        ## also add 'independent_variables' to the new dataset
        for attribute in set(sample_from_dataset.get_primary_attribute_names() + independent_variables):
            if reset_attribute_value.has_key(attribute):
                result_data[attribute] = resize(array(reset_attribute_value[attribute]), sampled_index.size)
            else:
                result_data[attribute] = sample_from_dataset.get_attribute_by_index(attribute, sampled_index)
        if id_name and result_data and id_name not in result_data:
            result_data[id_name] = arange(sampled_index.size, dtype='int32') + 1

        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name=table_name, table_data=result_data)

        result_dataset = Dataset(id_name=id_name,
                                 in_storage=storage,
                                 in_table_name=table_name,
                                 dataset_name=dataset_name)
        index = arange(result_dataset.size())

    if append_to_realestate_dataset:
        if len(result_data) > 0:
            index = realestate_dataset.add_elements(result_data, require_all_attributes=False,
                                                    change_ids_if_not_unique=True)
        result_dataset = realestate_dataset

    return (result_dataset, index)
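In the non-residential branch of this version, demand is first converted from jobs to floor space before applying the vacancy identity. A small numeric sketch (all values invented):

num_jobs = 120            # jobs of this building type
sqft_per_job = 250.0      # from the building_sqft_per_job table
target_vacancy = 0.08

needed_sqft = num_jobs * sqft_per_job                        # 30000.0
target_num = int(round(needed_sqft / (1 - target_vacancy)))
print(target_num)         # 32609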
def probsample_noreplace(source_array, sample_size, prob_array=None,
                         exclude_element=None, exclude_index=None,
                         return_index=False, try_replace=True):
    """Generate non-repeating random 1-d samples from source_array of sample_size,
    excluding indices that appear in exclude_index.
    Return indices into source_array if return_index is True.
    source_array - the source array to sample from
    sample_size - scalar representing the sample size
    prob_array - the array used to weight the sample
    exclude_element - array of elements that should not appear in the result
    exclude_index - array of indices that should not appear in the result; can be
        used, for example, to exclude the current choice from sampling (indexed
        to source_array)
    """
    if sample_size <= 0:
        #logger.log_warning("sample_size is %s. Nothing is sampled." % sample_size)
        if return_index:
            return array([], dtype='i')
        else:
            return array([], dtype=source_array.dtype)

    if prob_array is None:
        return sample_noreplace(source_array, sample_size, return_index=return_index,
                                try_replace=try_replace)

    # make a copy of prob_array so we won't change its original value in the sampling process
    prob_array2 = prob_array.copy()
    if exclude_element is not None:
        prob_array2[ismember(source_array, exclude_element)] = 0.0
    if exclude_index is not None:
        index_range = arange(source_array.size, dtype="i")
        if isinstance(exclude_index, numpy.ndarray):
            exclude_index = exclude_index[ismember(exclude_index, index_range)]
            prob_array2[exclude_index] = 0.0
        elif exclude_index in index_range:
            prob_array2[exclude_index] = 0.0

    nzc = nonzerocounts(prob_array2)
    if nzc == 0:
        raise ValueError, "The weight array contains no non-zero elements. Check the weight used for sampling."
    if nzc < sample_size:
        if try_replace:
            logger.log_warning("The weight array contains %s non-zero elements, less than the sample_size %s. Using probsample_replace. " % (nzc, sample_size))
            return probsample_replace(source_array, sample_size, prob_array=prob_array2,
                                      return_index=return_index)
        else:
            logger.log_warning("There are %s eligible elements, less than the sample_size %s. Sampling %s. " % (nzc, sample_size, nzc))
            sample_size = nzc
    if nzc == sample_size:
        nonzeroindex = prob_array2.nonzero()[0]
        if return_index:
            return nonzeroindex
        else:
            return source_array[nonzeroindex]

    to_be_sampled = sample_size
    sampled_index = array([], dtype='i')  # initialize sampled_index
    while True:
        proposed_index = probsample_replace(source_array, to_be_sampled, prob_array2,
                                            return_index=True)
        valid_index = unique(proposed_index, return_index=False)
        #assert all( logical_not(ismember(valid_index, sampled_index)) )
        #valid_index = valid_index[logical_not(ismember(valid_index, sampled_index))]
        #this should not be necessary because we control the prob_array
        sampled_index = concatenate((sampled_index, valid_index))
        to_be_sampled -= valid_index.size
        if to_be_sampled == 0:
            if return_index:
                return sampled_index
            else:
                return source_array[sampled_index]
        prob_array2[proposed_index] = 0.0
        nzc = nonzerocounts(prob_array2)
        # given that we have checked that nonzerocounts(prob_array2) >= sample_size,
        # the weights should not run out before we have enough non-repeating samples
        assert nzc > 0
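For a fixed weight vector, the rejection loop above is a close analogue of numpy's weighted sampling without replacement; a self-contained sketch with invented data:

import numpy as np

source = np.arange(100, 110)
weights = np.array([0, 1, 3, 0, 2, 2, 0, 1, 1, 5], dtype=float)

np.random.seed(0)
probs = weights / weights.sum()
sample = np.random.choice(source, size=4, replace=False, p=probs)
print(sample)   # 4 distinct elements; zero-weight entries are never drawn

Unlike the plain numpy call, probsample_noreplace additionally handles the exclusion arguments and the degenerate cases (empty weights, fewer eligible elements than sample_size).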
def run(self, realestate_dataset, living_units_dataset,
        year=None,
        occupied_spaces_variable="occupied_units",
        total_spaces_variable="total_units",
        target_attribute_name='target_vacancy_rate',
        sample_from_dataset=None,
        living_units_from_dataset=None,
        sample_filter="",
        reset_attribute_value={},
        year_built='year_built',
        dataset_pool=None,
        append_to_realestate_dataset=False,
        table_name="development_projects",
        dataset_name="development_project",
        id_name='development_project_id',
        **kwargs):
    """ sample_filter attribute/variable indicates which records in the dataset
        are eligible in the sampling for removal or cloning
        append_to_realestate_dataset - whether to append the new dataset to realestate_dataset
    """
    if self.target_vancy_dataset is None:
        raise RuntimeError, "target_vacancy_rate dataset is unspecified."

    if not sample_from_dataset or not living_units_from_dataset:
        logger.log_note('No development projects or no living units of development projects to sample from. Development projects are taken from the building dataset and thus living units from the living_units dataset.')
        sample_from_dataset = realestate_dataset
        living_units_from_dataset = living_units_dataset

    if dataset_pool is None:
        dataset_pool = SessionConfiguration().get_dataset_pool()
    if year is None:
        year = SimulationState().get_current_time()
    this_year_index = where(self.target_vancy_dataset.get_attribute('year') == year)[0]
    target_vacancy_for_this_year = DatasetSubset(self.target_vancy_dataset, this_year_index)

    column_names = list(set(self.target_vancy_dataset.get_known_attribute_names())
                        - set([target_attribute_name, occupied_spaces_variable,
                               total_spaces_variable, 'year', '_hidden_id_']))
    column_names.sort(reverse=True)
    column_values = dict([(name, target_vacancy_for_this_year.get_attribute(name))
                          for name in column_names + [target_attribute_name]])

    independent_variables = list(set([re.sub('_max$', '', re.sub('_min$', '', col))
                                      for col in column_names]))
    sample_dataset_known_attributes = sample_from_dataset.get_known_attribute_names()
    for attribute in independent_variables:
        if attribute not in sample_dataset_known_attributes:
            sample_from_dataset.compute_one_variable_with_unknown_package(attribute, dataset_pool=dataset_pool)
    sample_dataset_known_attributes = sample_from_dataset.get_known_attribute_names()  # update after compute

    if sample_filter:
        short_name = VariableName(sample_filter).get_alias()
        if short_name not in sample_dataset_known_attributes:
            filter_indicator = sample_from_dataset.compute_variables(sample_filter, dataset_pool=dataset_pool)
        else:
            filter_indicator = sample_from_dataset.get_attribute(short_name)
    else:
        filter_indicator = 1

    sampled_index = array([], dtype=int32)

    # log header
    if PrettyTable is not None:
        status_log = PrettyTable()
        status_log.set_field_names(column_names + ["actual", "target", "expected", "difference", "action"])
    else:
        logger.log_status("\t".join(column_names + ["actual", "target", "expected", "difference", "action"]))
    error_log = ''
    for index in range(target_vacancy_for_this_year.size()):
        sample_indicator = ones(sample_from_dataset.size(), dtype='bool')
        criterion = {}  # for logging
        for attribute in independent_variables:
            if attribute in sample_dataset_known_attributes:
                sample_attribute = sample_from_dataset.get_attribute(attribute)
            else:
                raise ValueError, "attribute %s used in target vacancy dataset can not be found in dataset %s" % (
                    attribute, realestate_dataset.get_dataset_name())

            if attribute + '_min' in column_names:
                amin = target_vacancy_for_this_year.get_attribute(attribute + '_min')[index]
                criterion.update({attribute + '_min': amin})
                if amin != -1:
                    sample_indicator *= sample_attribute >= amin
            if attribute + '_max' in column_names:
                amax = target_vacancy_for_this_year.get_attribute(attribute + '_max')[index]
                criterion.update({attribute + '_max': amax})
                if amax != -1:
                    sample_indicator *= sample_attribute <= amax
            if attribute in column_names:
                aval = column_values[attribute][index]
                criterion.update({attribute: aval})
                if aval == -1:
                    continue
                elif aval == -2:
                    ## treat -2 in control totals column as complement set,
                    ## i.e. all other values not already specified in this column
                    sample_indicator *= logical_not(ismember(sample_attribute, column_values[attribute]))
                else:
                    sample_indicator *= sample_attribute == aval

        this_total_spaces_variable, this_occupied_spaces_variable = total_spaces_variable, occupied_spaces_variable
        ## total/occupied_spaces_variable can be specified either as a universal name for all realestate
        ## or in the target_vacancy_rate dataset for each vacancy category
        if occupied_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_occupied_spaces_variable = target_vacancy_for_this_year.get_attribute(occupied_spaces_variable)[index]
        if total_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_total_spaces_variable = target_vacancy_for_this_year.get_attribute(total_spaces_variable)[index]

        # 'col' is the last entry of column_names, leaked from the list comprehension
        # above (Python 2 scoping); criterion[col] identifies the vacancy category
        this_total_spaces_variable += '_' + str(criterion[col])
        this_occupied_spaces_variable += '_' + str(criterion[col])

        logger.be_quiet()  # temporarily disable logging
        realestate_dataset.compute_one_variable_with_unknown_package(this_occupied_spaces_variable, dataset_pool=dataset_pool)
        realestate_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        sample_from_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        logger.talk()

        actual_num = (realestate_dataset.get_attribute(this_total_spaces_variable)).sum()
        # target_num is obsolete with this version.
        target_num = int(round((realestate_dataset.get_attribute(this_occupied_spaces_variable)).sum() /
                               (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        '''If the target vacancy is very small and the inflow to the region big,
        it is not enough to check only the current simulation year's vacancy.
        The simulation is more robust if the BTM is anticipating the next year's
        population (of households and jobs). This version calculates the
        non-residential spaces based on sqft requirements of jobs per sector.
        #TODO: Make code more general to cover various stratifications in the real estate market.
        '''
        if criterion[col] == 0:
            """ Option without demography model:
            idx = where(self.control_totals.get_attribute("year")==year + 1)[0]
            this_years_control_totals = DatasetSubset(self.control_totals, idx)
            expected_num = int(round( this_years_control_totals.get_attribute('total_number_of_households').sum() /\
                                     (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))"""
            hh_dataset = dataset_pool.get_dataset('household')
            number_of_hh = hh_dataset.size()
            expected_num = int(round(number_of_hh /
                                     (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        if criterion[col] > 0:
            # Getting control totals per sector in a dictionary.
            # Index of the subset of control totals for the next simulation year:
            idx = where(self.employment_control_totals.get_attribute("year") == year)[0]
            this_years_control_totals = DatasetSubset(self.employment_control_totals, idx)
            # Index of non-home-based control totals in the current sector.
            # Only non-home-based jobs are supported. TODO: Support home-based jobs.
            idx_non_home_based = where(logical_and(this_years_control_totals['home_based_status'] == 0,
                                                   this_years_control_totals['sector_id'] == criterion[col]))[0]
            this_years_control_totals = DatasetSubset(this_years_control_totals, idx_non_home_based)
            #idx_current_sector = where(this_years_control_totals['sector_id'] == criterion[col])[0]
            next_years_jobs = this_years_control_totals['number_of_jobs']
            controled_sectors = this_years_control_totals['sector_id']
            # dictionary with sector ids as keys and numbers of jobs as values,
            # to ensure multiplication with the right requirements
            sector_job_totals = dict(zip(controled_sectors, next_years_jobs.T))

            # Getting info on required sqft per sector:
            #a_zone_id = min(self.building_sqft_per_job['zone_id']) # Get a zone number from the definition table. Taking the minimum is arbitrary; this code assumes constant sqft requirements in all zones. TODO: Support different sqft requirements per zone.
            #idx_zone = where(self.building_sqft_per_job['zone_id'] == a_zone_id)[0]
            #subset_sqft_per_job = DatasetSubset(self.building_sqft_per_job, idx_zone)
            #sqft_per_job = subset_sqft_per_job['building_sqft_per_job']
            #sectors_with_requirements = subset_sqft_per_job['sector_id']
            #requirements_by_sector = dict(zip(sectors_with_requirements, sqft_per_job.T))
            #
            #needed_sqft_over_all_sectors = sector_job_totals[criterion[col]] * requirements_by_sector[criterion[col]]
            #expected_num = int(round( needed_sqft_over_all_sectors /\
            #                         (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
            idx_sector = where(self.sectors['sector_id'] == criterion[col])[0]
            subset_sqft_per_job_sector = DatasetSubset(self.sectors, idx_sector)
            needed_sqft_current_sector = sector_job_totals[criterion[col]] * subset_sqft_per_job_sector.get_attribute('sqm_per_job')
            expected_num = int(round(needed_sqft_current_sector /
                                     (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))

        diff = expected_num - actual_num
        # Previous version, which was checking the current year's occupation:
        #diff = target_num - actual_num
        this_sampled_index = array([], dtype=int32)
        if diff > 0:
            total_spaces_in_sample_dataset = sample_from_dataset.get_attribute(this_total_spaces_variable)
            legit_index = where(logical_and(sample_indicator, filter_indicator) * total_spaces_in_sample_dataset > 0)[0]
            if legit_index.size > 0:
                mean_size = total_spaces_in_sample_dataset[legit_index].mean()
                num_of_projects_to_sample = int(diff / mean_size)
                ## sample at least 1 project when diff > 0; otherwise it is an
                ## endless loop when num_of_projects_to_sample == 0
                num_of_projects_to_sample = num_of_projects_to_sample if num_of_projects_to_sample > 0 else 1
                while total_spaces_in_sample_dataset[this_sampled_index].sum() < diff:
                    lucky_index = sample_replace(legit_index, num_of_projects_to_sample)
                    this_sampled_index = concatenate((this_sampled_index, lucky_index))
                this_sampled_index = this_sampled_index[0:(1 + searchsorted(cumsum(total_spaces_in_sample_dataset[this_sampled_index]), diff))]
                sampled_index = concatenate((sampled_index, this_sampled_index))
            else:
                error_log += "There is nothing to sample from %s and no new development will happen for " % sample_from_dataset.get_dataset_name() + \
                             ','.join([col + "=" + str(criterion[col]) for col in column_names]) + '\n'
        #if diff < 0: #TODO demolition; not yet supported

        ## log status
        action = "0"
        if this_sampled_index.size > 0:
            action_num = total_spaces_in_sample_dataset[this_sampled_index].sum()
            if diff > 0:
                action = "+" + str(action_num)
            if diff < 0:
                action = "-" + str(action_num)
        cat = [str(criterion[col]) for col in column_names]
        cat += [str(actual_num), str(target_num), str(expected_num), str(diff), action]
        if PrettyTable is not None:
            status_log.add_row(cat)
        else:
            logger.log_status("\t".join(cat))

    if PrettyTable is not None:
        logger.log_status("\n" + status_log.get_string())
    if error_log:
        logger.log_error(error_log)

    #logger.log_note("Updating attributes of %s sampled development events." % sampled_index.size)
    result_data = {}
    result_dataset = None
    index = array([], dtype='int32')
    if sampled_index.size > 0:
        ### ideally duplicate_rows() is all that is needed to add newly cloned rows;
        ### to be more cautious, copy the data to be cloned, remove elements, then append the cloned data
        ##realestate_dataset.duplicate_rows(sampled_index)
        #result_data.setdefault(year_built, resize(year, sampled_index.size).astype('int32'))
        # The line above is commented out because year_built is overwritten in the loop below anyway.
        ## also add 'independent_variables' to the new dataset
        for attribute in set(sample_from_dataset.get_primary_attribute_names() + independent_variables):
            if reset_attribute_value.has_key(attribute):
                result_data[attribute] = resize(array(reset_attribute_value[attribute]), sampled_index.size)
            else:
                result_data[attribute] = sample_from_dataset.get_attribute_by_index(attribute, sampled_index)
        # Reset the year_built attribute.
        result_data['year_built'] = resize(year, sampled_index.size).astype('int32')
        # TODO: Uncomment the following three lines to reset land_area, tax_exempt, zgde.
        # Tests still to be done. parcel_id should be changed by the location choice model.
        #result_data['land_area'] = resize(-1, sampled_index.size).astype('int32')
        #result_data['tax_exempt'] = resize(-1, sampled_index.size).astype('int32')
        #result_data['zgde'] = resize(-1, sampled_index.size).astype('int32')

        if id_name and result_data and id_name not in result_data:
            result_data[id_name] = arange(sampled_index.size, dtype='int32') + 1

        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name=table_name, table_data=result_data)

        result_dataset = Dataset(id_name=id_name,
                                 in_storage=storage,
                                 in_table_name=table_name,
                                 dataset_name=dataset_name)
        index = arange(result_dataset.size())

    if append_to_realestate_dataset:
        if len(result_data) > 0:
            logger.start_block('Appending development events and living units')
            logger.log_note("Append %d sampled development events to real estate dataset." % len(result_data[result_data.keys()[0]]))
            index = realestate_dataset.add_elements(result_data, require_all_attributes=False,
                                                    change_ids_if_not_unique=True)

            logger.start_block('Creating id mapping')
            # remember the ids from the development_event_history dataset
            mapping_new_old = self.get_mapping_of_old_ids_to_new_ids(result_data, realestate_dataset, index)
            logger.end_block()

            '''Getting the living units associated with the selected development events by
            iterating over the mapping dictionary and selecting each time all living units
            matching the old building ids. The living units are then added to
            selected_living_units_dict, which is then added to the living_units dataset.
            A dictionary is needed to use the add_elements method. Creating a dictionary
            also clones the records; the subset is only a view on the original table.'''
            selected_living_units_dict = {}
            counter = 0
            for new_id in mapping_new_old:
                if counter == 0:
                    logger.log_note("Log assignment of every 100th development event")
                counter += 1
                if counter % 100 == 0:
                    logger.log_note("Assembling living units for development event %s" % new_id)
                sel_index = [i for i in range(0, len(living_units_from_dataset['building_id']))
                             if living_units_from_dataset['building_id'][i] == mapping_new_old[new_id]]
                living_units_this_sampled_building = DatasetSubset(living_units_from_dataset, sel_index)
                if len(selected_living_units_dict) == 0:
                    logger.start_block('Assign new building id')
                    for attribute_name in living_units_this_sampled_building.get_primary_attribute_names():
                        column = living_units_this_sampled_building.get_attribute(attribute_name)
                        if attribute_name == 'building_id':
                            new_ids = array(living_units_this_sampled_building.size() * [new_id], dtype=int32)
                            selected_living_units_dict.update({attribute_name: new_ids})
                        else:
                            selected_living_units_dict.update({attribute_name: column})
                    logger.end_block()
                else:
                    this_living_units_dict = {}
                    for attribute_name in living_units_this_sampled_building.get_primary_attribute_names():
                        column = living_units_this_sampled_building.get_attribute(attribute_name)
                        if attribute_name == 'building_id':
                            new_ids = array(living_units_this_sampled_building.size() * [new_id], dtype=int32)
                            this_living_units_dict.update({attribute_name: new_ids})
                        else:
                            this_living_units_dict.update({attribute_name: column})
                    for attribute_name in living_units_this_sampled_building.get_primary_attribute_names():
                        selected_living_units_dict[attribute_name] = concatenate(
                            [selected_living_units_dict[attribute_name], this_living_units_dict[attribute_name]])

            # Reset the year_built attribute of the living units.
            selected_living_units_dict['year_built'] = resize(year, len(selected_living_units_dict['year_built'])).astype('int32')
            # TODO: Uncomment the following two lines to reset rent_price, zgde. Tests still to be done.
            #selected_living_units_dict['rent_price'] = resize(-1, len(selected_living_units_dict['rent_price'])).astype('int32')
            #selected_living_units_dict['zgde'] = resize(-1, len(selected_living_units_dict['zgde'])).astype('int32')

            index_units = living_units_dataset.add_elements(selected_living_units_dict,
                                                            require_all_attributes=False,
                                                            change_ids_if_not_unique=True)

            # Check consistency of buildings and living units. All living units must belong to a building.
            if SimulationState().get_current_time() - SimulationState().get_start_time() == 1:
                for building_id in living_units_dataset['building_id']:
                    if building_id not in realestate_dataset['building_id']:
                        logger.log_warning('Living unit with building_id %d has no corresponding building.' % building_id)
                    # Uncomment the next line to enforce consistency of the living units
                    # and building datasets; then you may comment out the two previous lines.
                    #assert(building_id in realestate_dataset['building_id']), 'Living unit with building_id %d has no corresponding building.' % (building_id)

            result_dataset = realestate_dataset
            logger.end_block()

    # It is recommended to derive all variables of buildings in relation to living units
    # via expression variables. However, if the building dataset contains attributes derived
    # from living units, these attributes should be consistent with the living units table.
    # An example: the residential_units attribute of each building should be consistent
    # with the number of living units associated with it.
    #self.check_consistency_of_living_units_per_building(realestate_dataset, living_units_dataset, mapping_new_old)

    return (result_dataset, index)
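A minimal sketch of the building-id remapping that the loop above performs on living units, with an invented mapping_new_old (note that a numpy.where over building_id would replace the O(n*m) list comprehension used above):

import numpy as np

# hypothetical mapping from new (cloned) building ids to the sampled old ids
mapping_new_old = {201: 7, 202: 9}
living_units_building_id = np.array([7, 7, 9, 12])
living_units_area = np.array([80, 65, 120, 90])

new_building_id, new_area = [], []
for new_id, old_id in mapping_new_old.items():
    sel = np.where(living_units_building_id == old_id)[0]
    new_building_id.extend([new_id] * sel.size)
    new_area.extend(living_units_area[sel])
print(new_building_id)   # [201, 201, 202]
print(new_area)          # [80, 65, 120]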
def run(self, realestate_dataset,
        year=None,
        occupied_spaces_variable="occupied_units",
        total_spaces_variable="total_units",
        target_attribute_name='target_vacancy_rate',
        sample_from_dataset=None,
        sample_filter="",
        reset_attribute_value={},
        year_built='year_built',
        dataset_pool=None,
        append_to_realestate_dataset=False,
        table_name="development_projects",
        dataset_name="development_project",
        id_name=[],
        **kwargs):
    """sample_filter attribute/variable indicates which records in the dataset are eligible in the sampling for removal or cloning
    append_to_realestate_dataset - whether to append the new dataset to realestate_dataset
    """
    if self.target_vancy_dataset is None:
        raise RuntimeError("target_vacancy_rate dataset is unspecified.")

    if sample_from_dataset is None:
        sample_from_dataset = realestate_dataset

    #if dataset_pool is None:
    #    dataset_pool = SessionConfiguration().get_dataset_pool()
    if year is None:
        year = SimulationState().get_current_time()
    this_year_index = where(self.target_vancy_dataset.get_attribute('year')==year)[0]
    target_vacancy_for_this_year = DatasetSubset(self.target_vancy_dataset, this_year_index)

    column_names = list(set(self.target_vancy_dataset.get_known_attribute_names()) -
                        set([target_attribute_name, occupied_spaces_variable, total_spaces_variable, 'year', '_hidden_id_']))
    column_names.sort(reverse=True)
    column_values = dict([(name, target_vacancy_for_this_year.get_attribute(name)) for name in column_names + [target_attribute_name]])

    # strip _min/_max suffixes to get the underlying variables the criteria refer to
    independent_variables = list(set([re.sub('_max$', '', re.sub('_min$', '', col)) for col in column_names]))
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()
    for variable in independent_variables:
        if variable not in dataset_known_attributes:
            realestate_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
            sample_from_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()  # update after compute

    if sample_filter:
        short_name = VariableName(sample_filter).get_alias()
        if short_name not in dataset_known_attributes:
            filter_indicator = sample_from_dataset.compute_variables(sample_filter, dataset_pool=dataset_pool)
        else:
            filter_indicator = sample_from_dataset.get_attribute(short_name)
    else:
        filter_indicator = 1

    sampled_index = array([], dtype=int32)

    #log header
    if PrettyTable is not None:
        status_log = PrettyTable()
        status_log.set_field_names(column_names + ["actual", "target", "difference", "action"])
    else:
        logger.log_status("\t".join(column_names + ["actual", "target", "difference", "action"]))
    error_log = ''
    for index in range(target_vacancy_for_this_year.size()):
        this_sampled_index = array([], dtype=int32)
        indicator = ones(realestate_dataset.size(), dtype='bool')
        sample_indicator = ones(sample_from_dataset.size(), dtype='bool')
        criterion = {}  # for logging
        for attribute in independent_variables:
            if attribute in dataset_known_attributes:
                dataset_attribute = realestate_dataset.get_attribute(attribute)
                sample_attribute = sample_from_dataset.get_attribute(attribute)
            else:
                raise ValueError("attribute %s used in target vacancy dataset cannot be found in dataset %s" % (attribute, realestate_dataset.get_dataset_name()))

            if attribute + '_min' in column_names:
                amin = target_vacancy_for_this_year.get_attribute(attribute+'_min')[index]
                criterion.update({attribute + '_min': amin})
                if amin != -1:
                    indicator *= dataset_attribute >= amin
                    sample_indicator *= sample_attribute >= amin
            if attribute + '_max' in column_names:
                amax = target_vacancy_for_this_year.get_attribute(attribute+'_max')[index]
                criterion.update({attribute + '_max': amax})
                if amax != -1:
                    indicator *= dataset_attribute <= amax
                    sample_indicator *= sample_attribute <= amax
            if attribute in column_names:
                aval = column_values[attribute][index]
                criterion.update({attribute: aval})
                if aval == -1:
                    continue
                elif aval == -2:
                    ## treat -2 in control totals column as complement set, i.e. all other values not already specified in this column
                    indicator *= logical_not(ismember(dataset_attribute, column_values[attribute]))
                    sample_indicator *= logical_not(ismember(sample_attribute, column_values[attribute]))
                else:
                    indicator *= dataset_attribute == aval
                    sample_indicator *= sample_attribute == aval

        this_total_spaces_variable, this_occupied_spaces_variable = total_spaces_variable, occupied_spaces_variable
        ## total/occupied_spaces_variable can be specified either as a universal name for all realestate
        ## or in the target_vacancy_rate dataset for each vacancy category
        if occupied_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_occupied_spaces_variable = target_vacancy_for_this_year.get_attribute(occupied_spaces_variable)[index]
        if total_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_total_spaces_variable = target_vacancy_for_this_year.get_attribute(total_spaces_variable)[index]

        logger.be_quiet()  # temporarily disable logging
        realestate_dataset.compute_one_variable_with_unknown_package(this_occupied_spaces_variable, dataset_pool=dataset_pool)
        realestate_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        sample_from_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        logger.talk()

        actual_num = (indicator * realestate_dataset.get_attribute(this_total_spaces_variable)).sum()
        target_num = int(round((indicator * realestate_dataset.get_attribute(this_occupied_spaces_variable)).sum() /
                               (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        diff = target_num - actual_num
        if diff > 0:
            total_spaces_in_sample_dataset = sample_from_dataset.get_attribute(this_total_spaces_variable)
            legit_index = where(logical_and(sample_indicator, filter_indicator) * total_spaces_in_sample_dataset > 0)[0]
            if legit_index.size > 0:
                mean_size = total_spaces_in_sample_dataset[legit_index].mean()
                # sample at least one project per draw so the loop below always progresses
                num_of_projects_to_sample = max(1, int(diff / mean_size))
                while total_spaces_in_sample_dataset[this_sampled_index].sum() < diff:
                    lucky_index = sample_replace(legit_index, num_of_projects_to_sample)
                    this_sampled_index = concatenate((this_sampled_index, lucky_index))
                # trim the sample to the first projects whose cumulative size covers diff
                this_sampled_index = this_sampled_index[0:(1+searchsorted(cumsum(total_spaces_in_sample_dataset[this_sampled_index]), diff))]
                sampled_index = concatenate((sampled_index, this_sampled_index))
            else:
                error_log += "There is nothing to sample from %s and no new development will happen for " % sample_from_dataset.get_dataset_name() + \
                             ','.join([col+"="+str(criterion[col]) for col in column_names]) + '\n'
        #if diff < 0: #TODO demolition; not yet supported

        ##log status
        action = "0"
        if this_sampled_index.size > 0:
            action_num = total_spaces_in_sample_dataset[this_sampled_index].sum()
            if diff > 0: action = "+" + str(action_num)
            if diff < 0: action = "-" + str(action_num)
        cat = [str(criterion[col]) for col in column_names]
        cat += [str(actual_num), str(target_num), str(diff), action]

        if PrettyTable is not None:
            status_log.add_row(cat)
        else:
            logger.log_status("\t".join(cat))

    if PrettyTable is not None:
        logger.log_status("\n" + status_log.get_string())
    if error_log:
        logger.log_error(error_log)

    result_data = {}
    result_dataset = None
    index = array([], dtype='int32')
    if sampled_index.size > 0:
        ### ideally duplicate_rows() is all that is needed to add newly cloned rows
        ### to be more cautious, copy the data to be cloned, remove elements, then append the cloned data
        ##realestate_dataset.duplicate_rows(sampled_index)
        result_data.setdefault(year_built, resize(year, sampled_index.size).astype('int32'))
        for attribute in sample_from_dataset.get_primary_attribute_names():
            if attribute in reset_attribute_value:
                result_data[attribute] = resize(array(reset_attribute_value[attribute]), sampled_index.size)
            else:
                result_data[attribute] = sample_from_dataset.get_attribute_by_index(attribute, sampled_index)

        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name=table_name, table_data=result_data)

        result_dataset = Dataset(id_name=id_name,
                                 in_storage=storage,
                                 in_table_name=table_name,
                                 dataset_name=dataset_name
                                 )
        index = arange(result_dataset.size())

    if append_to_realestate_dataset:
        if len(result_data) > 0:
            index = realestate_dataset.add_elements(result_data, require_all_attributes=False,
                                                    change_ids_if_not_unique=True)
        result_dataset = realestate_dataset

    return (result_dataset, index)
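# A hypothetical invocation sketch of run() above (the model class name, the
# `buildings` dataset, and the filter expression are assumptions for illustration;
# only run()'s signature and return value are taken from the code):
#
#   model = RealEstateTransitionModel(...)  # enclosing class, set up with a target vacancy dataset
#   projects, new_index = model.run(buildings,
#                                   year=2010,
#                                   sample_filter='building.year_built > 1990',
#                                   append_to_realestate_dataset=True)
#
# With append_to_realestate_dataset=True, run() returns the (grown) realestate
# dataset itself plus the indices of the newly appended rows; otherwise it returns
# a standalone development_project dataset built from the sampled rows.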