예제 #1
0
    def create_prediction_success_table(self, 
                                        summarize_by=None, 
                                        predicted_choice_id_name=None,
                                        predicted_choice_id_prefix="predicted_",
                                        log_to_file=None,
                                        force_predict=True):
        """Log a prediction success table (observed vs. predicted categories).

        Rows of the table are observed categories, columns are predicted
        categories; each cell counts the agents (restricted to the prediction
        index) observed in the row category and predicted in the column
        category.  The table, a per-row success rate, a per-category
        predicted/observed ratio and the total success rate are written
        through ``logger``.

        Arguments:
        summarize_by -- expression defining the category of each observation.
            It must belong either to the choice set or to the agent set.
            Defaults to the agents' choice id attribute, i.e. predicted
            choices are compared directly with observed choices.
        predicted_choice_id_name -- agent attribute holding the predicted
            choice id; defaults to predicted_choice_id_prefix + choice id name.
        predicted_choice_id_prefix -- prefix used to build the default
            predicted_choice_id_name.
        log_to_file -- if non-empty, file logging is enabled on this file
            while the table is written.
        force_predict -- if True, self.predict() is always run; otherwise it
            is run only when the predicted attribute is not yet known to the
            agent set.

        Returns None after logging the table, and also None when the
        prediction run fails; returns False when summarize_by refers to a
        dataset that is neither the choice set nor the agent set.
        """
        agents = self.get_agent_set()
        choices = self.get_choice_set()
        choice_id_name = choices.get_id_name()[0]
        
        if self.agents_index_for_prediction is not None:
            agents_index = self.agents_index_for_prediction
        else:
            agents_index = self.get_agent_set_index()

        if not predicted_choice_id_name:
            predicted_choice_id_name = predicted_choice_id_prefix + choice_id_name
            
        if force_predict or (predicted_choice_id_name not in agents.get_known_attribute_names()):
            if not self.predict(predicted_choice_id_name=predicted_choice_id_name,
                                agents_index=agents_index
                                ):
                logger.log_error("Failed to run simulation for prediction; unable to create prediction success table.")
                return

        file_logging_enabled = log_to_file is not None and len(log_to_file) > 0
        if file_logging_enabled:
            logger.enable_file_logging(log_to_file)
            
        ## by default, compare predicted choice with observed choice
        ## this is not feasible for location choice model, where the 
        ## alternative set is too large to be useful
        if summarize_by is None:
            summarize_by = "%s.%s" % (agents.dataset_name, choice_id_name)
            
        summarize_dataset_name = VariableName(summarize_by).get_dataset_name()
        if summarize_dataset_name == choices.dataset_name:
            # The category is an attribute of the chosen/predicted alternative:
            # look it up through the alternatives' indices.
            summary_id = choices.compute_variables(summarize_by)
            
            chosen_choice_id = agents.get_attribute_by_index(choice_id_name, agents_index)
            predicted_choice_id = agents.get_attribute_by_index(predicted_choice_id_name, agents_index)
            chosen_choice_index = choices.get_id_index(chosen_choice_id)
            # NOTE(review): try_get_id_index presumably returns a sentinel for
            # ids missing from the choice set; confirm the sentinel cannot act
            # as a valid (e.g. negative, wrapping) index into summary_id.
            predicted_choice_index = choices.try_get_id_index(predicted_choice_id)
            
            chosen_summary_id = summary_id[chosen_choice_index]
            predicted_summary_id = summary_id[predicted_choice_index]
    
            unique_summary_id = unique(summary_id)
        elif summarize_dataset_name == agents.dataset_name:
            # The category is an agent variable: evaluate it once with the
            # observed choices and once with the predicted choices swapped in.
            chosen_summary_id = agents.compute_variables(summarize_by)[agents_index]
            
            chosen_choice_id = agents.get_attribute(choice_id_name).copy()
            predicted_choice_id = agents.get_attribute(predicted_choice_id_name)
            agents.modify_attribute(name=choice_id_name, data=predicted_choice_id)
            try:
                predicted_summary_id = agents.compute_variables(summarize_by)[agents_index]
            finally:
                # always put the observed choices back, even if the
                # computation fails, so the agent set is never left corrupted
                agents.modify_attribute(name=choice_id_name, data=chosen_choice_id)
    
            unique_summary_id = unique( concatenate((chosen_summary_id, predicted_summary_id)) )
        else:
            logger.log_error("summarize_by expression '%s' is specified for dataset %s, which is neither the choice_set '%s' nor the agent_set '%s'." 
                             % (summarize_by, summarize_dataset_name, choices.dataset_name, agents.dataset_name))
            # do not leave the file log enabled by this call open on the error path
            if file_logging_enabled:
                logger.disable_file_logging(filename=log_to_file)
            return False

        # negative summary ids are excluded from the table
        unique_nonneg_summary_id = unique_summary_id[unique_summary_id >= 0] 
        n_categories = unique_nonneg_summary_id.size
        # observed on row, predicted on column
        prediction_matrix = zeros( (n_categories, n_categories), dtype="int32" )

        def _convert_array_to_tab_delimited_string(an_array):
            from numpy import dtype
            if an_array.dtype == dtype('f'):
                return "\t".join(["%5.4f" % item for item in an_array])
            return "\t".join([str(item) for item in an_array])
        
        logger.log_status("Observed_id\tSuccess_rate\t%s" % \
                          _convert_array_to_tab_delimited_string(unique_nonneg_summary_id) )
        total_correct = 0
        # rows without any observation keep a success rate of 0
        success_rate = zeros( n_categories, dtype="float32" )
        for i, observed_id in enumerate(unique_nonneg_summary_id):
            predicted_id = predicted_summary_id[chosen_summary_id==observed_id]
            # count, per category, the predictions made for agents observed in observed_id
            prediction_matrix[i] = ndimage.sum(ones(predicted_id.size), labels=predicted_id, index=unique_nonneg_summary_id )
            row_total = prediction_matrix[i].sum()
            if row_total > 0:
                success_rate[i] = float(prediction_matrix[i, i]) / row_total
                total_correct = total_correct + prediction_matrix[i, i]
            logger.log_status("%s\t\t%5.4f\t\t%s" % (observed_id, success_rate[i], 
                                              _convert_array_to_tab_delimited_string(prediction_matrix[i]) ) )

        # per category: number predicted (column sum) over number observed (row sum);
        # categories without observations keep 0
        predicted_to_observed = zeros( n_categories, dtype="float32" )
        for j in range(n_categories):
            observed_total = prediction_matrix[j, :].sum()
            if observed_total > 0:
                predicted_to_observed[j] = float(prediction_matrix[:, j].sum()) / observed_total
        logger.log_status("%s\t\t%s\t\t%s" % (' ', ' ', 
                                                 _convert_array_to_tab_delimited_string( predicted_to_observed ) ))

        # guard against an empty table, which would otherwise divide by zero
        total_predictions = prediction_matrix.sum()
        if total_predictions > 0:
            total_success_rate = total_correct / float(total_predictions)
        else:
            total_success_rate = 0.0
        logger.log_status("\nTotal success rate: %5.4f" % total_success_rate)
        logger.disable_file_logging(filename=log_to_file)
    def run(self, realestate_dataset,
            year=None, 
            occupied_spaces_variable="occupied_units",
            total_spaces_variable="total_units",
            target_attribute_name='target_vacancy_rate',
            sample_from_dataset = None,
            sample_filter="",
            reset_attribute_value={}, 
            year_built = 'year_built',
            dataset_pool=None,
            append_to_realestate_dataset = False,
            table_name = "development_projects",
            dataset_name = "development_project",
            id_name = [],
            **kwargs):
        """Sample records to clone so that each category reaches its target vacancy rate.

        For every row of this year's target vacancy table, the records of
        realestate_dataset matching the row's criteria are selected, the
        number of spaces needed to reach the target vacancy rate is computed
        as occupied / (1 - target rate), and, if more spaces are needed than
        exist, records of sample_from_dataset are drawn with replacement until
        the shortfall is covered.  Removal (demolition) is not supported.

        Criteria columns of the target vacancy table are interpreted as:
        <attribute>_min / <attribute>_max -- inclusive bounds (-1 disables the bound);
        <attribute> -- exact match; -1 matches everything, -2 matches all
            values not listed anywhere in that column.

        Arguments:
        realestate_dataset -- dataset whose occupied and total spaces are evaluated.
        year -- year of the target vacancy rows to use; defaults to the
            current simulation time.
        occupied_spaces_variable, total_spaces_variable -- variable names
            giving occupied/total spaces; if the target vacancy table has a
            column of that name, the column's value for each row is used as
            the variable name instead.
        target_attribute_name -- column of the target vacancy table holding the rate.
        sample_from_dataset -- dataset to draw records from; defaults to realestate_dataset.
        sample_filter -- attribute/variable indicating which records of
            sample_from_dataset are eligible for sampling.
        reset_attribute_value -- dictionary {attribute: value}; listed
            attributes get this value in the result instead of the sampled
            one.  It is only read, never modified, by this method.
        year_built -- name of the result attribute initialized to `year`.
        dataset_pool -- passed on to all variable computations.
        append_to_realestate_dataset -- whether to append the new records to
            realestate_dataset.
        table_name, dataset_name, id_name -- used to build the resulting Dataset.
        kwargs -- accepted but not used.

        Returns a tuple (result_dataset, index).  When appending,
        result_dataset is realestate_dataset and index is what add_elements()
        returned (an empty array if nothing was sampled).  Otherwise
        result_dataset is a new Dataset of the sampled records with index
        covering all of it, or (None, empty array) if nothing was sampled.

        Raises RuntimeError if no target vacancy dataset is set, and
        ValueError if a criteria attribute is unknown to realestate_dataset.
        """
        
        if self.target_vancy_dataset is None:
            raise RuntimeError("target_vacancy_rate dataset is unspecified.")
        
        if not sample_from_dataset:
            sample_from_dataset = realestate_dataset
            
        #if dataset_pool is None:
        #    dataset_pool = SessionConfiguration().get_dataset_pool()
        if year is None:
            year = SimulationState().get_current_time()
        this_year_index = where(self.target_vancy_dataset.get_attribute('year')==year)[0]
        target_vacancy_for_this_year = DatasetSubset(self.target_vancy_dataset, this_year_index)
        
        # every column that is not a known bookkeeping column is a selection criterion
        column_names = list(set( self.target_vancy_dataset.get_known_attribute_names() ) - set( [ target_attribute_name, occupied_spaces_variable, total_spaces_variable, 'year', '_hidden_id_'] ))
        column_names.sort(reverse=True)
        column_values = dict([ (name, target_vacancy_for_this_year.get_attribute(name)) for name in column_names + [target_attribute_name]])
        
        # strip the _min/_max suffixes to get the underlying attribute names
        independent_variables = list(set([re.sub('_max$', '', re.sub('_min$', '', col)) for col in column_names]))
        dataset_known_attributes = realestate_dataset.get_known_attribute_names()
        for variable in independent_variables:
            if variable not in dataset_known_attributes:
                realestate_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
                sample_from_dataset.compute_one_variable_with_unknown_package(variable, dataset_pool=dataset_pool)
                
        dataset_known_attributes = realestate_dataset.get_known_attribute_names() #update after compute
        if sample_filter:
            short_name = VariableName(sample_filter).get_alias()
            # NOTE(review): the membership test uses realestate_dataset's
            # attribute names although the filter is read from
            # sample_from_dataset; confirm this is intended when the two differ.
            if short_name not in dataset_known_attributes:
                filter_indicator = sample_from_dataset.compute_variables(sample_filter, dataset_pool=dataset_pool)
            else:
                filter_indicator = sample_from_dataset.get_attribute(short_name)
        else:
            filter_indicator = 1
                
        sampled_index = array([], dtype=int32)

        #log header
        log_fields = column_names + ["actual", "target", "difference", "action"]
        if PrettyTable is not None:
            status_log = PrettyTable()
            status_log.set_field_names(log_fields)
        else:
            logger.log_status("\t".join(log_fields))
        error_messages = []
        # the target vacancy table is not modified below, so this is loop-invariant
        target_known_attributes = target_vacancy_for_this_year.get_known_attribute_names()
        for index in range(target_vacancy_for_this_year.size()):
            this_sampled_index = array([], dtype=int32)
            indicator = ones( realestate_dataset.size(), dtype='bool' )
            sample_indicator = ones( sample_from_dataset.size(), dtype='bool' )
            criterion = {}   # for logging
            for attribute in independent_variables:
                if attribute not in dataset_known_attributes:
                    raise ValueError("attribute %s used in target vacancy dataset can not be found in dataset %s" % (attribute, realestate_dataset.get_dataset_name()))
                dataset_attribute = realestate_dataset.get_attribute(attribute)
                sample_attribute = sample_from_dataset.get_attribute(attribute)
                
                if attribute + '_min' in column_names:
                    amin = column_values[attribute + '_min'][index]
                    criterion[attribute + '_min'] = amin
                    if amin != -1:
                        indicator *= dataset_attribute >= amin
                        sample_indicator *= sample_attribute >= amin
                if attribute + '_max' in column_names: 
                    amax = column_values[attribute + '_max'][index]
                    criterion[attribute + '_max'] = amax
                    if amax != -1:
                        indicator *= dataset_attribute <= amax
                        sample_indicator *= sample_attribute <= amax
                if attribute in column_names: 
                    aval = column_values[attribute][index] 
                    criterion[attribute] = aval
                    if aval == -1:
                        continue
                    elif aval == -2:  ##treat -2 in control totals column as complement set, i.e. all other values not already specified in this column
                        indicator *= logical_not(ismember(dataset_attribute, column_values[attribute]))
                        sample_indicator *= logical_not(ismember(sample_attribute, column_values[attribute]))
                    else:
                        indicator *= dataset_attribute == aval
                        sample_indicator *= sample_attribute == aval
                        
            this_total_spaces_variable, this_occupied_spaces_variable = total_spaces_variable, occupied_spaces_variable
            ## total/occupied_spaces_variable can be specified either as a universal name for all realestate 
            ## or in target_vacancy_rate dataset for each vacancy category
            if occupied_spaces_variable in target_known_attributes:
                this_occupied_spaces_variable = target_vacancy_for_this_year.get_attribute(occupied_spaces_variable)[index]

            if total_spaces_variable in target_known_attributes:
                this_total_spaces_variable = target_vacancy_for_this_year.get_attribute(total_spaces_variable)[index]
            
            logger.be_quiet() #temporarily disable logging
            realestate_dataset.compute_one_variable_with_unknown_package(this_occupied_spaces_variable, dataset_pool=dataset_pool)
            realestate_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
            sample_from_dataset.compute_one_variable_with_unknown_package(this_total_spaces_variable, dataset_pool=dataset_pool)
            logger.talk()
            
            actual_num = (indicator * realestate_dataset.get_attribute(this_total_spaces_variable)).sum()
            # spaces needed so that occupied / total == 1 - target vacancy rate
            target_num = int(round( (indicator * realestate_dataset.get_attribute(this_occupied_spaces_variable)).sum() /\
                                    (1 - column_values[target_attribute_name][index]) 
                            ))
            diff = target_num - actual_num
            if diff > 0:
                total_spaces_in_sample_dataset = sample_from_dataset.get_attribute(this_total_spaces_variable)
                # eligible records: match the criteria, pass the filter and have some space
                legit_index = where(logical_and(sample_indicator, filter_indicator) * total_spaces_in_sample_dataset > 0)[0]
                if legit_index.size > 0:
                    mean_size = total_spaces_in_sample_dataset[legit_index].mean()
                    # Draw at least one record per round: when diff < mean_size
                    # the integer estimate is 0, and drawing 0 records would
                    # make the loop below spin forever.
                    num_of_projects_to_sample = max(1, int( diff / mean_size ))
                    while total_spaces_in_sample_dataset[this_sampled_index].sum() < diff:
                        lucky_index = sample_replace(legit_index, num_of_projects_to_sample)
                        this_sampled_index = concatenate((this_sampled_index, lucky_index))
                    # keep only as many draws as needed to first reach the shortfall
                    this_sampled_index = this_sampled_index[0:(1+searchsorted(cumsum(total_spaces_in_sample_dataset[this_sampled_index]), diff))]
                    sampled_index = concatenate((sampled_index, this_sampled_index))
                else:
                    error_messages.append("There is nothing to sample from %s and no new development will happen for " % sample_from_dataset.get_dataset_name() + \
                              ','.join([col+"="+str(criterion[col]) for col in column_names]) + '\n')
            #if diff < 0: #TODO demolition; not yet supported
            
            ##log status
            action = "0"
            if this_sampled_index.size > 0:
                # only reachable after sampling, so total_spaces_in_sample_dataset is defined
                action_num = total_spaces_in_sample_dataset[this_sampled_index].sum()
                if diff > 0: action = "+" + str(action_num)
                if diff < 0: action = "-" + str(action_num)
            cat = [ str(criterion[col]) for col in column_names]
            cat += [str(actual_num), str(target_num), str(diff), action]
            
            if PrettyTable is not None:
                status_log.add_row(cat)
            else:                
                logger.log_status("\t".join(cat))
            
        if PrettyTable is not None:
            logger.log_status("\n" + status_log.get_string())
        if error_messages:
            logger.log_error(''.join(error_messages))
            
        result_data = {}
        result_dataset = None
        index = array([], dtype='int32')
        if sampled_index.size > 0:
            ### ideally duplicate_rows() is all needed to add newly cloned rows
            ### to be more cautious, copy the data to be cloned, remove elements, then append the cloned data
            ##realestate_dataset.duplicate_rows(sampled_index)
            # NOTE(review): if year_built is a primary attribute of
            # sample_from_dataset and not in reset_attribute_value, the loop
            # below replaces this value with the sampled one; confirm intent.
            result_data.setdefault(year_built, resize(year, sampled_index.size).astype('int32'))
            for attribute in sample_from_dataset.get_primary_attribute_names():
                if attribute in reset_attribute_value:
                    result_data[attribute] = resize(array(reset_attribute_value[attribute]), sampled_index.size)
                else:
                    result_data[attribute] = sample_from_dataset.get_attribute_by_index(attribute, sampled_index)
        
            storage = StorageFactory().get_storage('dict_storage')
            storage.write_table(table_name=table_name, table_data=result_data)
    
            result_dataset = Dataset(id_name = id_name,
                                      in_storage = storage,
                                      in_table_name = table_name,
                                      dataset_name = dataset_name
                                      )
            index = arange(result_dataset.size())
            
        if append_to_realestate_dataset:
            if result_data:
                index = realestate_dataset.add_elements(result_data, require_all_attributes=False,
                                                        change_ids_if_not_unique=True)                
            result_dataset = realestate_dataset
        
        return (result_dataset, index)