# NOTE: imports reconstructed for this snippet; the opus_core module paths are
# assumed from the OPUS/UrbanSim codebase and may vary by version.
import re
from numpy import (zeros, ones, where, unique, concatenate, array, arange,
                   resize, cumsum, searchsorted, logical_and, logical_not, int32)
from scipy import ndimage
from opus_core.logger import logger
from opus_core.variables.variable_name import VariableName
from opus_core.datasets.dataset import Dataset, DatasetSubset
from opus_core.simulation_state import SimulationState
from opus_core.storage_factory import StorageFactory
from opus_core.sampling_toolbox import sample_replace
from opus_core.misc import ismember
# PrettyTable is optional; fall back to tab-delimited logging when unavailable
try:
    from prettytable import PrettyTable
except ImportError:
    PrettyTable = None

def create_prediction_success_table(self, summarize_by=None,
                                    predicted_choice_id_name=None,
                                    predicted_choice_id_prefix="predicted_",
                                    log_to_file=None,
                                    force_predict=True):
    agents = self.get_agent_set()
    choices = self.get_choice_set()
    choice_id_name = choices.get_id_name()[0]
    if self.agents_index_for_prediction is not None:
        agents_index = self.agents_index_for_prediction
    else:
        agents_index = self.get_agent_set_index()

    if predicted_choice_id_name is None or len(predicted_choice_id_name) == 0:
        predicted_choice_id_name = predicted_choice_id_prefix + choice_id_name

    if force_predict or (predicted_choice_id_name not in agents.get_known_attribute_names()):
        if not self.predict(predicted_choice_id_name=predicted_choice_id_name,
                            agents_index=agents_index):
            logger.log_error("Failed to run simulation for prediction; unable to create prediction success table.")
            return

    if log_to_file is not None and len(log_to_file) > 0:
        logger.enable_file_logging(log_to_file)

    ## by default, compare the predicted choice with the observed choice;
    ## this is not feasible for location choice models, where the
    ## alternative set is too large to be useful
    if summarize_by is None:
        summarize_by = "%s.%s" % (agents.dataset_name, choice_id_name)

    summarize_dataset_name = VariableName(summarize_by).get_dataset_name()
    if summarize_dataset_name == choices.dataset_name:
        summary_id = choices.compute_variables(summarize_by)

        chosen_choice_id = agents.get_attribute_by_index(choices.get_id_name()[0], agents_index)
        predicted_choice_id = agents.get_attribute_by_index(predicted_choice_id_name, agents_index)
        chosen_choice_index = choices.get_id_index(chosen_choice_id)
        predicted_choice_index = choices.try_get_id_index(predicted_choice_id)

        chosen_summary_id = summary_id[chosen_choice_index]
        predicted_summary_id = summary_id[predicted_choice_index]
        unique_summary_id = unique(summary_id)
    elif summarize_dataset_name == agents.dataset_name:
        chosen_summary_id = agents.compute_variables(summarize_by)[agents_index]
        chosen_choice_id = agents.get_attribute(choice_id_name).copy()
        predicted_choice_id = agents.get_attribute(predicted_choice_id_name)

        # temporarily swap in the predicted choice to evaluate summarize_by, then restore
        agents.modify_attribute(name=choice_id_name, data=predicted_choice_id)
        predicted_summary_id = agents.compute_variables(summarize_by)[agents_index]
        agents.modify_attribute(name=choice_id_name, data=chosen_choice_id)
        unique_summary_id = unique(concatenate((chosen_summary_id, predicted_summary_id)))
    else:
        logger.log_error("summarize_by expression '%s' is specified for dataset %s, "
                         "which is neither the choice_set '%s' nor the agent_set '%s'."
                         % (summarize_by, summarize_dataset_name,
                            choices.dataset_name, agents.dataset_name))
        return False

    unique_nonneg_summary_id = unique_summary_id[unique_summary_id >= 0]
    # observed on rows, predicted on columns
    prediction_matrix = zeros((unique_nonneg_summary_id.size, unique_nonneg_summary_id.size),
                              dtype="int32")

    def _convert_array_to_tab_delimited_string(an_array):
        from numpy import dtype
        if an_array.dtype == dtype('f'):
            return "\t".join(["%5.4f" % item for item in an_array])
        return "\t".join([str(item) for item in an_array])

    logger.log_status("Observed_id\tSuccess_rate\t%s" %
                      _convert_array_to_tab_delimited_string(unique_nonneg_summary_id))
    i = 0
    total_correct = 0
    success_rate = zeros(unique_nonneg_summary_id.size, dtype="float32")
    for observed_id in unique_nonneg_summary_id:
        predicted_id = predicted_summary_id[chosen_summary_id == observed_id]
        # count, per summary category, how many predictions fall in that category
        prediction_matrix[i] = ndimage.sum(ones(predicted_id.size),
                                           labels=predicted_id,
                                           index=unique_nonneg_summary_id)
        if prediction_matrix[i].sum() > 0:
            success_rate[i] = float(prediction_matrix[i, i]) / prediction_matrix[i].sum()
            total_correct = total_correct + prediction_matrix[i, i]
        else:
            success_rate[i] = 0
        logger.log_status("%s\t\t%5.4f\t\t%s" %
                          (observed_id, success_rate[i],
                           _convert_array_to_tab_delimited_string(prediction_matrix[i])))
        i += 1

    # ratio of total predicted (column sum) to total observed (row sum) per category
    success_rate2 = zeros(i, dtype="float32")
    for j in range(i):
        if prediction_matrix[j, :].sum() > 0:
            success_rate2[j] = float(prediction_matrix[:, j].sum()) / prediction_matrix[j, :].sum()
        else:
            success_rate2[j] = 0
    logger.log_status("%s\t\t%s\t\t%s" %
                      (' ', ' ', _convert_array_to_tab_delimited_string(success_rate2)))

    logger.log_status("\nTotal success rate: %5.4f" %
                      (total_correct / float(prediction_matrix.sum())))
    if log_to_file is not None and len(log_to_file) > 0:
        logger.disable_file_logging(filename=log_to_file)
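
# --- Illustrative sketch (not part of the model API) ------------------------
# The table above is a confusion matrix built one observed row at a time with
# scipy.ndimage.sum, which counts label occurrences per category. A minimal,
# self-contained demonstration of that cross-tabulation follows; the observed/
# predicted arrays are made up for illustration only.
def _demo_prediction_matrix():
    from numpy import zeros, ones, unique, array
    from scipy import ndimage

    observed = array([1, 1, 2, 2, 2, 3, 3])   # hypothetical observed categories
    predicted = array([1, 2, 2, 2, 3, 3, 3])  # hypothetical predicted categories
    categories = unique(observed)

    matrix = zeros((categories.size, categories.size), dtype="int32")
    for i, obs_id in enumerate(categories):
        pred = predicted[observed == obs_id]
        # weights of 1 summed per label == count of predictions per category
        matrix[i] = ndimage.sum(ones(pred.size), labels=pred, index=categories)

    # rows: observed; columns: predicted; diagonal: correct predictions;
    # per-category success rate = diagonal / row sum
    return matrix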
def run(self, realestate_dataset,
        year=None,
        occupied_spaces_variable="occupied_units",
        total_spaces_variable="total_units",
        target_attribute_name='target_vacancy_rate',
        sample_from_dataset=None,
        sample_filter="",
        reset_attribute_value={},
        year_built='year_built',
        dataset_pool=None,
        append_to_realestate_dataset=False,
        table_name="development_projects",
        dataset_name="development_project",
        id_name=[],
        **kwargs):
    """Clone records sampled from sample_from_dataset until each vacancy
    category meets its target vacancy rate.

    sample_filter -- attribute/variable indicating which records in the dataset
        are eligible in the sampling for removal or cloning
    append_to_realestate_dataset -- whether to append the new dataset to
        realestate_dataset
    """
    if self.target_vancy_dataset is None:
        raise RuntimeError("target_vacancy_rate dataset is unspecified.")

    if not sample_from_dataset:
        sample_from_dataset = realestate_dataset

    #if dataset_pool is None:
    #    dataset_pool = SessionConfiguration().get_dataset_pool()
    if year is None:
        year = SimulationState().get_current_time()
    this_year_index = where(self.target_vancy_dataset.get_attribute('year') == year)[0]
    target_vacancy_for_this_year = DatasetSubset(self.target_vancy_dataset, this_year_index)

    column_names = list(set(self.target_vancy_dataset.get_known_attribute_names())
                        - set([target_attribute_name, occupied_spaces_variable,
                               total_spaces_variable, 'year', '_hidden_id_']))
    column_names.sort(reverse=True)
    column_values = dict([(name, target_vacancy_for_this_year.get_attribute(name))
                          for name in column_names + [target_attribute_name]])

    independent_variables = list(set([re.sub('_max$', '', re.sub('_min$', '', col))
                                      for col in column_names]))
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()
    for variable in independent_variables:
        if variable not in dataset_known_attributes:
            realestate_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
            sample_from_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
    dataset_known_attributes = realestate_dataset.get_known_attribute_names()  # update after compute

    if sample_filter:
        short_name = VariableName(sample_filter).get_alias()
        if short_name not in dataset_known_attributes:
            filter_indicator = sample_from_dataset.compute_variables(sample_filter, dataset_pool=dataset_pool)
        else:
            filter_indicator = sample_from_dataset.get_attribute(short_name)
    else:
        filter_indicator = 1

    sampled_index = array([], dtype=int32)

    # log header
    if PrettyTable is not None:
        status_log = PrettyTable()
        status_log.set_field_names(column_names + ["actual", "target", "difference", "action"])
    else:
        logger.log_status("\t".join(column_names + ["actual", "target", "difference", "action"]))
    error_log = ''
    for index in range(target_vacancy_for_this_year.size()):
        this_sampled_index = array([], dtype=int32)
        indicator = ones(realestate_dataset.size(), dtype='bool')
        sample_indicator = ones(sample_from_dataset.size(), dtype='bool')
        criterion = {}  # for logging
        for attribute in independent_variables:
            if attribute in dataset_known_attributes:
                dataset_attribute = realestate_dataset.get_attribute(attribute)
                sample_attribute = sample_from_dataset.get_attribute(attribute)
            else:
                raise ValueError("attribute %s used in target vacancy dataset can not be found in dataset %s"
                                 % (attribute, realestate_dataset.get_dataset_name()))

            if attribute + '_min' in column_names:
                amin = target_vacancy_for_this_year.get_attribute(attribute + '_min')[index]
                criterion.update({attribute + '_min': amin})
                if amin != -1:
                    indicator *= dataset_attribute >= amin
                    sample_indicator *= sample_attribute >= amin
            if attribute + '_max' in column_names:
                amax = target_vacancy_for_this_year.get_attribute(attribute + '_max')[index]
                criterion.update({attribute + '_max': amax})
                if amax != -1:
                    indicator *= dataset_attribute <= amax
                    sample_indicator *= sample_attribute <= amax
            if attribute in column_names:
                aval = column_values[attribute][index]
                criterion.update({attribute: aval})
                if aval == -1:
                    continue
                elif aval == -2:
                    ## treat -2 in control totals column as the complement set,
                    ## i.e. all other values not already specified in this column
                    indicator *= logical_not(ismember(dataset_attribute, column_values[attribute]))
                    sample_indicator *= logical_not(ismember(sample_attribute, column_values[attribute]))
                else:
                    indicator *= dataset_attribute == aval
                    sample_indicator *= sample_attribute == aval

        this_total_spaces_variable, this_occupied_spaces_variable = total_spaces_variable, occupied_spaces_variable
        ## total/occupied_spaces_variable can be specified either as a universal name for all real estate
        ## or in the target_vacancy_rate dataset for each vacancy category
        if occupied_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_occupied_spaces_variable = target_vacancy_for_this_year.get_attribute(occupied_spaces_variable)[index]
        if total_spaces_variable in target_vacancy_for_this_year.get_known_attribute_names():
            this_total_spaces_variable = target_vacancy_for_this_year.get_attribute(total_spaces_variable)[index]

        logger.be_quiet()  # temporarily disable logging
        realestate_dataset.compute_one_variable_with_unknown_package(this_occupied_spaces_variable, dataset_pool=dataset_pool)
        realestate_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        sample_from_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
        logger.talk()

        actual_num = (indicator * realestate_dataset.get_attribute(this_total_spaces_variable)).sum()
        target_num = int(round((indicator * realestate_dataset.get_attribute(this_occupied_spaces_variable)).sum() /
                               (1 - target_vacancy_for_this_year.get_attribute(target_attribute_name)[index])))
        diff = target_num - actual_num
        if diff > 0:
            total_spaces_in_sample_dataset = sample_from_dataset.get_attribute(this_total_spaces_variable)
            legit_index = where(logical_and(sample_indicator, filter_indicator) *
                                total_spaces_in_sample_dataset > 0)[0]
            if legit_index.size > 0:
                mean_size = total_spaces_in_sample_dataset[legit_index].mean()
                # sample at least one project per pass to avoid an infinite loop
                # when diff is smaller than the mean project size
                num_of_projects_to_sample = max(1, int(diff / mean_size))
                while total_spaces_in_sample_dataset[this_sampled_index].sum() < diff:
                    lucky_index = sample_replace(legit_index, num_of_projects_to_sample)
                    this_sampled_index = concatenate((this_sampled_index, lucky_index))
                # trim the draw to the first projects whose cumulative size covers the deficit
                this_sampled_index = this_sampled_index[0:(1 + searchsorted(cumsum(total_spaces_in_sample_dataset[this_sampled_index]), diff))]
                sampled_index = concatenate((sampled_index, this_sampled_index))
            else:
                error_log += ("There is nothing to sample from %s and no new development will happen for "
                              % sample_from_dataset.get_dataset_name()
                              + ','.join([col + "=" + str(criterion[col]) for col in column_names]) + '\n')

        #if diff < 0:  # TODO: demolition; not yet supported

        ## log status
        action = "0"
        if this_sampled_index.size > 0:
            action_num = total_spaces_in_sample_dataset[this_sampled_index].sum()
            if diff > 0:
                action = "+" + str(action_num)
            if diff < 0:
                action = "-" + str(action_num)
        cat = [str(criterion[col]) for col in column_names]
        cat += [str(actual_num), str(target_num), str(diff), action]

        if PrettyTable is not None:
            status_log.add_row(cat)
        else:
            logger.log_status("\t".join(cat))

    if PrettyTable is not None:
        logger.log_status("\n" + status_log.get_string())
    if error_log:
        logger.log_error(error_log)

    result_data = {}
    result_dataset = None
    index = array([], dtype='int32')
    if sampled_index.size > 0:
        ### ideally duplicate_rows() is all that is needed to add newly cloned rows;
        ### to be more cautious, copy the data to be cloned, remove elements, then append the cloned data
        ##realestate_dataset.duplicate_rows(sampled_index)
        result_data.setdefault(year_built, resize(year, sampled_index.size).astype('int32'))
        for attribute in sample_from_dataset.get_primary_attribute_names():
            if attribute in reset_attribute_value:
                result_data[attribute] = resize(array(reset_attribute_value[attribute]), sampled_index.size)
            else:
                result_data[attribute] = sample_from_dataset.get_attribute_by_index(attribute, sampled_index)

        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name=table_name, table_data=result_data)

        result_dataset = Dataset(id_name=id_name,
                                 in_storage=storage,
                                 in_table_name=table_name,
                                 dataset_name=dataset_name)
        index = arange(result_dataset.size())

    if append_to_realestate_dataset:
        if len(result_data) > 0:
            index = realestate_dataset.add_elements(result_data, require_all_attributes=False,
                                                    change_ids_if_not_unique=True)
        result_dataset = realestate_dataset

    return (result_dataset, index)
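
# --- Illustrative sketch (not part of the model API) ------------------------
# run() fills a capacity deficit by oversampling candidate projects with
# replacement until their combined size covers the deficit, then trimming the
# draw back to the first projects whose cumulative size reaches the target.
# A self-contained demonstration of that pattern follows; plain
# numpy.random.choice stands in for opus_core.sampling_toolbox.sample_replace,
# and the sizes/deficit values are made up.
def _demo_sample_until_covered():
    from numpy import array, arange, concatenate, cumsum, searchsorted, int32
    from numpy.random import choice

    sizes = array([10, 25, 40, 15])   # hypothetical total_units per candidate
    legit_index = arange(sizes.size)  # candidates passing all filters
    deficit = 120                     # target_num - actual_num

    mean_size = sizes[legit_index].mean()
    batch = max(1, int(deficit / mean_size))  # >= 1, so the loop always progresses

    sampled = array([], dtype=int32)
    while sizes[sampled].sum() < deficit:
        sampled = concatenate((sampled, choice(legit_index, size=batch)))

    # keep only the leading draws whose cumulative capacity first reaches the deficit
    sampled = sampled[:1 + searchsorted(cumsum(sizes[sampled]), deficit)]
    assert sizes[sampled].sum() >= deficit
    return sampled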