def run(self, individual_dataset, fraction_dataset, id_name1='blockgroup_id', 
         id_name2='zone_id', fraction_attribute_name='fraction'):
     
     """
     """
     assert id_name1 in individual_dataset.get_known_attribute_names()
     if id_name2 not in individual_dataset.get_known_attribute_names():           
         individual_dataset.add_primary_attribute(-1*ones(individual_dataset.size()), id_name2)
     fraction_id1 = fraction_dataset.get_attribute(id_name1)
     individual_id1 = individual_dataset.get_attribute(id_name1)
     unique_ids = unique(fraction_id1)
     
     for id1 in unique_ids:
         individual_of_id1 = where(individual_id1==id1)[0]
         n = individual_of_id1.size
         logger.log_status("Processing %s %s: %s individuals" % (id_name1, id1, n) )
         if n > 0:
             fractions = fraction_dataset.get_attribute(fraction_attribute_name)[fraction_id1==id1]
             id2 = fraction_dataset.get_attribute(id_name2)[fraction_id1==id1]
             ## ignore individuals in geography with sum of fractions less than 1.0e-2
             if fractions.sum() < 1.0e-2:
                 continue
             if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
                 fractions = normalize(fractions)
             fractions_cumsum = ncumsum(fractions)
             R = random(n)
             index = searchsorted(fractions_cumsum, R)
             individual_dataset.modify_attribute(id_name2, id2[index], index=individual_of_id1)
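A minimal sketch of the sampling step above, with made-up fractions and zone ids: the cumulative fractions are searched with uniform random draws, so each individual is assigned a zone with probability equal to that zone's fraction.

# Sketch of the cumsum/searchsorted assignment used above (made-up values).
from numpy import array, cumsum, searchsorted
from numpy.random import random

fractions = array([0.2, 0.5, 0.3])       # fractions of one blockgroup falling into three zones
zone_ids = array([101, 102, 103])        # candidate zone_id values
fractions = fractions / fractions.sum()  # renormalize, as the model does when the sum is off
R = random(10)                           # one uniform draw per individual
index = searchsorted(cumsum(fractions), R)
assigned_zone = zone_ids[index]          # zone 101/102/103 drawn with probability 0.2/0.5/0.3
print(assigned_zone)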
Example No. 2
    def _sample_by_agent_and_stratum(self, index1, index2, stratum, prob_array,
                                     chosen_choice_index,
                                     strata_sample_setting):
        """agent by agent and stratum by stratum stratified sampling, suitable for 2d prob_array and/or sample_size varies for agents
        this method is slower than _sample_by_stratum, for simpler stratified sampling use _sample_by_stratum instead"""

        rank_of_prob = rank(prob_array)
        rank_of_strata = rank(strata_sample_setting)

        J = self.__determine_sampled_index_size(strata_sample_setting,
                                                rank_of_strata)
        sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1
        self._sampling_probability = zeros((index1.size, J), dtype=float32)
        self._stratum_id = ones((index1.size, J), dtype=DTYPE) * NO_STRATUM_ID

        for i in range(index1.size):
            if rank_of_strata == 3:
                strata_sample_pairs = strata_sample_setting[i, :]
            else:
                strata_sample_pairs = strata_sample_setting

            if rank_of_prob == 2:
                prob = prob_array[i, :]
            else:
                prob = prob_array

            j = 0
            for (this_stratum, this_size) in strata_sample_pairs:
                if this_size <= 0: continue
                index_not_in_stratum = where(stratum != this_stratum)[0]
                this_prob = copy.copy(prob)

                this_prob[index_not_in_stratum] = 0.0
                this_prob = normalize(this_prob)

                if nonzerocounts(this_prob) < this_size:
                    logger.log_warning(
                        "weight array dosen't have enough non-zero counts, use sample with replacement"
                    )


                # chosen_index_to_index2 = where(index2 == chosen_choice_index[i])[0]
                # exclude_index passed to probsample_noreplace needs to be indexed to index2
                this_sampled_index = probsample_noreplace(
                    index2,
                    sample_size=this_size,
                    prob_array=this_prob,
                    exclude_index=chosen_choice_index[i],
                    return_index=True)
                sampled_index[i, j:j + this_size] = this_sampled_index

                self._sampling_probability[
                    i, j:j + this_size] = this_prob[this_sampled_index]
                self._stratum_id[i, j:j + this_size] = ones(
                    (this_sampled_index.size, ), dtype=DTYPE) * this_stratum

                j += this_size

        return index2[sampled_index]
    def _sample_by_agent_and_stratum(
        self, index1, index2, stratum, prob_array, chosen_choice_index, strata_sample_setting
    ):
        """agent by agent and stratum by stratum stratified sampling, suitable for 2d prob_array and/or sample_size varies for agents
        this method is slower than _sample_by_stratum, for simpler stratified sampling use _sample_by_stratum instead"""

        rank_of_prob = rank(prob_array)
        rank_of_strata = rank(strata_sample_setting)

        J = self.__determine_sampled_index_size(strata_sample_setting, rank_of_strata)
        sampled_index = zeros((index1.size, J), dtype="int32") - 1
        self._sampling_probability = zeros((index1.size, J), dtype=float32)
        self._stratum_id = ones((index1.size, J), dtype="int32") * NO_STRATUM_ID

        for i in range(index1.size):
            if rank_of_strata == 3:
                strata_sample_pairs = strata_sample_setting[i, :]
            else:
                strata_sample_pairs = strata_sample_setting

            if rank_of_prob == 2:
                prob = prob_array[i, :]
            else:
                prob = prob_array

            j = 0
            for (this_stratum, this_size) in strata_sample_pairs:
                if this_size <= 0:
                    continue
                index_not_in_stratum = where(stratum != this_stratum)[0]
                this_prob = copy.copy(prob)

                this_prob[index_not_in_stratum] = 0.0
                this_prob = normalize(this_prob)

                if nonzerocounts(this_prob) < this_size:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")

                #                chosen_index_to_index2 = where(index2 == chosen_choice_index[i])[0]
                # exclude_index passed to probsample_noreplace needs to be indexed to index2
                this_sampled_index = probsample_noreplace(
                    index2,
                    sample_size=this_size,
                    prob_array=this_prob,
                    exclude_index=chosen_choice_index[i],
                    return_index=True,
                )
                sampled_index[i, j : j + this_size] = this_sampled_index

                self._sampling_probability[i, j : j + this_size] = this_prob[this_sampled_index]
                self._stratum_id[i, j : j + this_size] = ones((this_sampled_index.size,), dtype="int32") * this_stratum

                j += this_size

        return index2[sampled_index]
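A small sketch of the per-stratum step inside the loop above, with made-up weights and strata; numpy.random.choice stands in here for opus_core's normalize and probsample_noreplace helpers.

# Sketch of the stratum-masking step (made-up data).
import numpy as np

prob = np.array([0.1, 0.3, 0.2, 0.25, 0.15])   # sampling weights over 5 alternatives
stratum = np.array([1, 1, 2, 2, 2])            # stratum id of each alternative
this_stratum, this_size = 2, 2                 # draw 2 alternatives from stratum 2

this_prob = prob.copy()
this_prob[stratum != this_stratum] = 0.0       # alternatives outside the stratum get zero weight
this_prob = this_prob / this_prob.sum()        # renormalize within the stratum

sampled = np.random.choice(np.arange(prob.size), size=this_size,
                           replace=False, p=this_prob)
print(sampled)                                 # two distinct indices drawn from {2, 3, 4}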
Example No. 4
    def _sample_by_stratum(self, index1, index2, stratum, prob_array,
                           chosen_choice_index, strata_sample_setting):
        """stratum by stratum stratified sampling, suitable for 1d prob_array and sample_size is the same for all agents"""
        if prob_array.ndim <> 1:
            raise RuntimeError, "_sample_by_stratum only suitable for 1d prob_array"

        sampled_index = zeros((index1.size, 1), dtype=DTYPE) - 1
        self._sampling_probability = zeros((index1.size, 1), dtype=float32)
        self._stratum_id = ones((index1.size, 1), dtype=DTYPE) * NO_STRATUM_ID
        for this_stratum, this_size in strata_sample_setting:
            index_not_in_stratum = where(stratum != this_stratum)[0]
            this_prob = copy.copy(prob_array)

            this_prob[index_not_in_stratum] = 0.0
            this_prob = normalize(this_prob)

            replace = False  # non-repeat sampling
            if nonzerocounts(this_prob) < this_size:
                logger.log_warning(
                    "weight array dosen't have enough non-zero counts, sample with replacement"
                )
                replace = True

            this_sampled_index = prob2dsample(
                index2,
                sample_size=(index1.size, this_size),
                prob_array=this_prob,
                exclude_index=chosen_choice_index,
                replace=replace,
                return_index=True)
            sampled_index = concatenate((sampled_index, this_sampled_index),
                                        axis=1)
            self._sampling_probability = concatenate(
                (self._sampling_probability, this_prob[this_sampled_index]),
                axis=1)
            self._stratum_id = concatenate(
                (self._stratum_id,
                 ones((this_sampled_index.shape[0], 1), dtype=DTYPE) *
                 this_stratum),
                axis=1)

        self._sampling_probability = self._sampling_probability[:, 1:]
        self._stratum_id = self._stratum_id[:, 1:]
        return index2[sampled_index[:, 1:]]
    def _sample_by_stratum(self, index1, index2, stratum, prob_array, chosen_choice_index, strata_sample_setting):
        """stratum by stratum stratified sampling, suitable for 1d prob_array and sample_size is the same for all agents"""
        if prob_array.ndim <> 1:
            raise RuntimeError, "_sample_by_stratum only suitable for 1d prob_array"

        sampled_index = zeros((index1.size, 1), dtype="int32") - 1
        self._sampling_probability = zeros((index1.size, 1), dtype=float32)
        self._stratum_id = ones((index1.size, 1), dtype="int32") * NO_STRATUM_ID
        for this_stratum, this_size in strata_sample_setting:
            index_not_in_stratum = where(stratum != this_stratum)[0]
            this_prob = copy.copy(prob_array)

            this_prob[index_not_in_stratum] = 0.0
            this_prob = normalize(this_prob)

            replace = False  # non-repeat sampling
            if nonzerocounts(this_prob) < this_size:
                logger.log_warning("weight array dosen't have enough non-zero counts, sample with replacement")
                replace = True

            this_sampled_index = prob2dsample(
                index2,
                sample_size=(index1.size, this_size),
                prob_array=this_prob,
                exclude_index=chosen_choice_index,
                replace=replace,
                return_index=True,
            )
            sampled_index = concatenate((sampled_index, this_sampled_index), axis=1)
            self._sampling_probability = concatenate(
                (self._sampling_probability, this_prob[this_sampled_index]), axis=1
            )
            self._stratum_id = concatenate(
                (self._stratum_id, ones((this_sampled_index.shape[0], 1), dtype="int32") * this_stratum), axis=1
            )

        self._sampling_probability = self._sampling_probability[:, 1:]
        self._stratum_id = self._stratum_id[:, 1:]
        return index2[sampled_index[:, 1:]]
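The grow-then-trim column pattern used by _sample_by_stratum, seeding the result with a dummy column of -1s, appending one block of columns per stratum, and slicing the dummy off at the end, can be sketched with made-up shapes:

# Sketch of the grow-then-trim column pattern (made-up shapes).
import numpy as np

n_agents = 4
sampled_index = np.zeros((n_agents, 1), dtype="int32") - 1   # dummy first column of -1s
for this_size in (2, 3):                                     # e.g. two strata, drawing 2 and 3 alternatives
    block = np.random.randint(0, 10, size=(n_agents, this_size))
    sampled_index = np.concatenate((sampled_index, block), axis=1)
print(sampled_index[:, 1:].shape)                            # (4, 5): dummy column dropped at the end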
Example No. 6
    def run(self, individual_dataset, fraction_dataset, id_name1='blockgroup_id', 
            id_name2='zone_id', fraction_attribute_name='fraction', dataset_pool=None):
        
        """
        """
        if dataset_pool is None:
            dataset_pool = SessionConfiguration().get_dataset_pool()
        if isinstance(individual_dataset, str):
            individual_dataset = dataset_pool[individual_dataset]
        if isinstance(fraction_dataset, str):
            fraction_dataset = dataset_pool[fraction_dataset]

        assert id_name1 in individual_dataset.get_known_attribute_names()
        if id_name2 not in individual_dataset.get_known_attribute_names():
            dtype = fraction_dataset.get_attribute(id_name2).dtype
            default_values = -1*ones(individual_dataset.size(), dtype=dtype)
            individual_dataset.add_primary_attribute(default_values, id_name2)
        fraction_id1 = fraction_dataset.get_attribute(id_name1)
        individual_id1 = individual_dataset.get_attribute(id_name1)
        unique_ids = unique(fraction_id1)
        
        for id1 in unique_ids:
            individual_of_id1 = where(individual_id1==id1)[0]
            n = individual_of_id1.size
            logger.log_status("Processing %s %s: %s individuals" % (id_name1, id1, n) )
            if n > 0:
                fractions = fraction_dataset.get_attribute(fraction_attribute_name)[fraction_id1==id1]
                id2 = fraction_dataset.get_attribute(id_name2)[fraction_id1==id1]
                ## ignore individuals in geography with sum of fractions less than 1.0e-2
                if fractions.sum() < 1.0e-2:
                    continue
                if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
                    fractions = normalize(fractions)
                fractions_cumsum = ncumsum(fractions)
                R = random(n)
                index = searchsorted(fractions_cumsum, R)
                individual_dataset.modify_attribute(id_name2, id2[index], index=individual_of_id1)
                
        individual_dataset.flush_dataset()
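A minimal sketch of the dtype-matched default column and positional write-back used above, with made-up sizes and ids: the new id_name2 attribute starts as -1 for every individual, using the same dtype as the fraction dataset's id column, and sampled ids are later written back by position.

# Sketch of the default column and positional assignment (made-up values).
from numpy import ones, array, int32

n_individuals = 5
zone_dtype = int32                                   # stands in for fraction_dataset.get_attribute('zone_id').dtype
zone_id = -1 * ones(n_individuals, dtype=zone_dtype) # default: every individual unassigned
individual_of_id1 = array([0, 2, 4])                 # individuals belonging to one blockgroup
zone_id[individual_of_id1] = array([101, 103, 101])  # sampled zone ids written back by position
print(zone_id)                                       # [101  -1 103  -1 101]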
    def run(self,
            individual_dataset,
            fraction_dataset,
            id_name1='blockgroup_id',
            id_name2='zone_id',
            fraction_attribute_name='fraction'):
        """
        """
        assert id_name1 in individual_dataset.get_known_attribute_names()
        if id_name2 not in individual_dataset.get_known_attribute_names():
            individual_dataset.add_primary_attribute(
                -1 * ones(individual_dataset.size()), id_name2)
        fraction_id1 = fraction_dataset.get_attribute(id_name1)
        individual_id1 = individual_dataset.get_attribute(id_name1)
        unique_ids = unique(fraction_id1)

        for id1 in unique_ids:
            individual_of_id1 = where(individual_id1 == id1)[0]
            n = individual_of_id1.size
            logger.log_status("Processing %s %s: %s individuals" %
                              (id_name1, id1, n))
            if n > 0:
                fractions = fraction_dataset.get_attribute(
                    fraction_attribute_name)[fraction_id1 == id1]
                id2 = fraction_dataset.get_attribute(id_name2)[fraction_id1 ==
                                                               id1]
                ## ignore individuals in geography with sum of fractions less than 1.0e-2
                if fractions.sum() < 1.0e-2:
                    continue
                if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
                    fractions = normalize(fractions)
                fractions_cumsum = ncumsum(fractions)
                R = random(n)
                index = searchsorted(fractions_cumsum, R)
                individual_dataset.modify_attribute(id_name2,
                                                    id2[index],
                                                    index=individual_of_id1)
    def run(self, data, upc_sequence, resources=None):

        self.mnl_probabilities=upc_sequence.probability_class
        self.bhhh_estimation = bhhh_mnl_estimation()

        modified_upc_sequence = UPCFactory().get_model(
            utilities=None, probabilities="opus_core.mnl_probabilities", choices=None)
        modified_upc_sequence.utility_class = upc_sequence.utility_class

        N, neqs, V = data.shape

        max_iter = resources.get("max_iterations", 100)  # default
        sc = SessionConfiguration()
        dataset_pool = sc.get_dataset_pool()
        sample_rate = dataset_pool.get_dataset("sample_rate")
        
        CLOSE = sc["CLOSE"]
        info_filename = sc["info_file"]
        info_filename = os.path.join('.', info_filename)
        info_file = open(info_filename, "a")
        constraint_dict = {1:'constrained', 0:'unconstrained'}
        swing_cases_fix = 0  #set swing alternatives to constrained (1) or unconstrained (0)
        prob_correlation = None
        
        choice_set = resources['_model_'].choice_set
        J = choice_set.size()
        alt_id = choice_set.get_id_attribute()
        movers = choice_set.get_attribute('movers')

        resources.check_obligatory_keys(["capacity_string"])
        supply = choice_set.get_attribute(resources["capacity_string"])

        index = resources.get("index", None)
        if index is None: # no sampling case, alternative set is the full choice_set
            index = arange(J)
        if index.ndim <= 1:
            index = repeat(index[newaxis,:], N, axis=0)

        if resources.get('aggregate_to_dataset', None):
            aggregate_dataset = dataset_pool.get_dataset(resources.get('aggregate_to_dataset'))
            choice_set_aggregate_id = choice_set.get_attribute(aggregate_dataset.get_id_name()[0])
            index = aggregate_dataset.get_id_index(choice_set_aggregate_id[index].ravel()).reshape(index.shape)

            supply = aggregate_dataset.get_attribute(resources["capacity_string"])
            J = aggregate_dataset.size()

            movers = aggregate_dataset.get_attribute("movers")

        demand_history = movers[:, newaxis]
        resources.merge({"index":index})
        
        pi = ones(index.shape, dtype=float32)  #initialize pi
        #average_omega = ones(J,dtype=float32)  #initialize average_omega
        logger.start_block('Outer Loop')
        for i in range(max_iter):
            logger.log_status('Outer Loop Iteration %s' % i)

            result = self.bhhh_estimation.run(data, modified_upc_sequence, resources)
            del self.bhhh_estimation; collect()
            self.bhhh_estimation = bhhh_mnl_estimation()

            probability = modified_upc_sequence.get_probabilities()
            if data.shape[2] == V:  #insert a placeholder for ln(pi) in data
                data = concatenate((data,ones((N,neqs,1),dtype=float32)), axis=2)
                coef_names = resources.get("coefficient_names")
                coef_names = concatenate( (coef_names, array(["ln_pi"])) )
                resources.merge({"coefficient_names":coef_names})
            else:
                beta_ln_pi = result['estimators'][where(coef_names == 'ln_pi')][0]
                logger.log_status("mu = 1/%s = %s" % (beta_ln_pi, 1/beta_ln_pi))
                
                prob_hat = safe_array_divide(probability, pi ** beta_ln_pi)
                #prob_hat = safe_array_divide(probability, pi)
                prob_hat_sum = prob_hat.sum(axis=1, dtype=float32)
                if not ma.allclose(prob_hat_sum, 1.0):
                    logger.log_status("probability doesn't sum up to 1, with minimum %s, and maximum %s" %
                                      (prob_hat_sum.min(), prob_hat_sum.max()))
                    
                    probability = normalize(prob_hat)

            demand = self.mnl_probabilities.get_demand(index, probability, J) * 1 / sample_rate
            demand_history = concatenate((demand_history,
                                          demand[:, newaxis]),
                                          axis=1)

            sdratio = safe_array_divide(supply, demand, return_value_if_denominator_is_zero=2.0)
            sdratio_matrix = sdratio[index]
            ## debug info
            from numpy import histogram 
            from opus_core.misc import unique
            cc = histogram(index.ravel(), unique(index.ravel()))[0]
            logger.log_status( "=================================================================")
            logger.log_status( "Probability min: %s, max: %s" % (probability.min(), probability.max()) )
            logger.log_status( "Demand min: %s, max: %s" % (demand.min(), demand.max()) )
            logger.log_status( "sdratio min: %s, max: %s" % (sdratio.min(), sdratio.max()) )
            logger.log_status( "demand[sdratio==sdratio.min()]=%s" % demand[sdratio==sdratio.min()] )
            logger.log_status( "demand[sdratio==sdratio.max()]=%s" % demand[sdratio==sdratio.max()] )
            logger.log_status( "Counts of unique submarkets in alternatives min: %s, max: %s" % (cc.min(), cc.max()) )
            logger.log_status( "=================================================================")

            constrained_locations_matrix, omega, info = self.inner_loop(supply, demand, probability,
                                                                        index, sdratio_matrix,
                                                                        J, max_iteration=max_iter)

            inner_iterations, constrained_locations_history, swing_index, average_omega_history = info
    
            for idx in swing_index:
                logger.log_status("swinging alt with id %s set to %s" % (alt_id[idx], constraint_dict[swing_cases_fix]))
                constrained_locations_matrix[index==idx] = swing_cases_fix
    
            if swing_index.size > 0:    
                info_file.write("swing of constraints found with id %s \n" % alt_id[swing_index])
                info_file.write("outer_iteration, %i, " % i + ", ".join([str(i)]*(len(inner_iterations))) + "\n")
                info_file.write("inner_iteration, , " + ", ".join(inner_iterations) + "\n")
                info_file.write("id, sdratio, " + ", ".join(["avg_omega"]*len(inner_iterations)) + "\n")
                for idx in swing_index:
                    line = str(alt_id[idx]) + ','
                    line += str(sdratio[idx]) + ','
                    line += ",".join([str(x) for x in average_omega_history[idx,]])
                    line += "\n"
                    info_file.write(line)
    
                info_file.write("\n")
                info_file.flush()

            outer_iterations = [str(i)] * len(inner_iterations)
            prob_min = [str(probability.min())] * len(inner_iterations)
            prob_max = [str(probability.max())] * len(inner_iterations)

            pi_new = self.mnl_probabilities.get_pi(sdratio_matrix, omega, constrained_locations_matrix)

            data[:,:,-1] = ln(pi_new)
            #diagnostic output
            
            if not ma.allclose(pi, pi_new, atol=CLOSE):
                if i > 0:  #don't print this for the first iteration
                    logger.log_status("min of abs(pi(l+1) - pi(l)): %s" % absolute(pi_new - pi).min())
                    logger.log_status("max of abs(pi(l+1) - pi(l)): %s" % absolute(pi_new - pi).max())
                    logger.log_status("mean of pi(l+1) - pi(l): %s" % (pi_new - pi).mean())
                    logger.log_status('Standard Deviation pi(l+1) - pi(l): %s' % standard_deviation(pi_new - pi))
                    logger.log_status('correlation of pi(l+1) and pi(l): %s' % corr(pi_new.ravel(), pi.ravel())[0,1])

                pi = pi_new
                probability_old = probability   # keep probability of the previous loop, for statistics computation only    
            else:   #convergence criterion achieved, quitting outer loop
                logger.log_status("pi(l) == pi(l+1): Convergence criterion achieved")
    
                info_file.write("\nConstrained Locations History:\n")
                info_file.write("outer_iteration," + ",".join(outer_iterations) + "\n")
                info_file.write("inner_iteration," + ",".join(inner_iterations) + "\n")
                info_file.write("minimum_probability," + ",".join(prob_min) + "\n")
                info_file.write("maximum_probability," + ",".join(prob_max) + "\n")
                for row in range(J):
                    line = [str(x) for x in constrained_locations_history[row,]]
                    info_file.write(str(alt_id[row]) + "," + ",".join(line) + "\n")

                info_file.flush()

                info_file.write("\nDemand History:\n")
                i_str = [str(x) for x in range(i)]
                info_file.write("outer_iteration, (movers)," + ",".join(i_str) + "\n")
                #info_file.write(", ,\n")
                for row in range(J):
                    line = [str(x) for x in demand_history[row,]]
                    info_file.write(str(alt_id[row]) + "," + ",".join(line) + "\n")

                demand_history_info_criteria = [500, 100, 50, 20]
                for criterion in demand_history_info_criteria:
                    com_rows_index = where(movers <= criterion)[0]
                    info_file.write("\nDemand History for alternatives with less than or equal to %s movers in 1998:\n" % criterion)
                    i_str = [str(x) for x in range(i)]
                    info_file.write("outer_iteration, (movers)," + ",".join(i_str) + "\n")
                    #info_file.write(", movers,\n")
                    for row in com_rows_index:
                        line = [str(x) for x in demand_history[row,]]
                        info_file.write(str(alt_id[row]) + "," + ",".join(line) + "\n")

                #import pdb; pdb.set_trace()
                #export prob correlation history
                correlation_indices, prob_correlation = self.compute_prob_correlation(probability_old, probability, prob_hat, index, resources)

                info_file.write("\nCorrelation of Probabilities:\n")
                c_name = ['corr(p_ij p~_ij)', 'corr(p_ij p^_ij)', 'corr(p_ij dummy)', 'corr(p~_ij p^_ij)', 'corr(p~_ij dummy)', 'corr(p^_ij dummy)']

                info_file.write("com_id, " + ",".join(c_name) + "\n")

                #info_file.write(", ,\n")
                for row in range(correlation_indices.size):
                    line = [str(x) for x in prob_correlation[row,]]
                    info_file.write(str(alt_id[correlation_indices[row]]) + "," + ",".join(line) + "\n")

                info_file.close()

                result['pi'] = pi
                return result

        logger.end_block()
        try:
            info_file.close()
        except:
            pass

        raise RuntimeError("max iteration reached without convergence.")
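Two array manipulations from the outer loop above, sketched in isolation with made-up sizes: the ln(pi) placeholder column appended to the 3d data array on the first pass and refreshed on later passes, and the absolute-tolerance convergence test on pi.

# Sketch of the ln_pi placeholder column and the convergence test (made-up sizes).
import numpy as np

N, neqs, V = 3, 4, 2
data = np.random.rand(N, neqs, V).astype(np.float32)
pi = np.ones((N, neqs), dtype=np.float32)

if data.shape[2] == V:                        # first outer iteration: add a column for ln(pi)
    data = np.concatenate((data, np.ones((N, neqs, 1), dtype=np.float32)), axis=2)

pi_new = pi * 1.0001                          # stand-in for mnl_probabilities.get_pi(...)
data[:, :, -1] = np.log(pi_new)               # refresh the ln_pi column each outer iteration

CLOSE = 1e-3
converged = np.ma.allclose(pi, pi_new, atol=CLOSE)
print(converged)                              # True: |pi_new - pi| is below the tolerance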
Example No. 9
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=False,
            with_replacement=False,
            resources=None,
            dataset_pool=None):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "sample_size": sample_size,
            "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice
        })

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1

        with_replacement = local_resources.get("with_replacement")

        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            else:
                varname = VariableName(weight)
                if varname.get_dataset_name() == choice.get_dataset_name():
                    weight = choice.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 1
                elif varname.get_interaction_set_names() is not None:
                    ## weights can be an interaction variable
                    interaction_dataset = InteractionDataset(local_resources)
                    weight = interaction_dataset.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 2
                    assert (len(weight.shape) >= rank_of_weight)
                else:
                    err_msg = ("weight is neither a known attribute name "
                               "nor a simple variable from the choice dataset "
                               "nor an interaction variable: '%s'" % weight)
                    logger.log_error(err_msg)
                    raise ValueError(err_msg)
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        if rank_of_weight == 1:  # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement  # sampling with no replacement
            non_zero_counts = nonzerocounts(weight)
            if non_zero_counts < J:
                logger.log_warning(
                    "weight array dosen't have enough non-zero counts, use sample with replacement"
                )
                replace = True
            if non_zero_counts > 0:
                sampled_index = prob2dsample(
                    index2,
                    sample_size=(index1.size, J),
                    prob_array=prob,
                    exclude_index=chosen_choice_index_to_index2,
                    replace=replace,
                    return_index=True)
            else:
                # all alternatives have a zero weight
                sampled_index = zeros((index1.size, 0), dtype=DTYPE)
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1

            for i in range(index1.size):
                replace = with_replacement  # sampling with/without replacement
                i_prob = prob[i, :]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning(
                        "weight array dosen't have enough non-zero counts, use sample with replacement"
                    )
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i, :] = probsample_noreplace(
                    index2,
                    sample_size=J,
                    prob_array=i_prob,
                    exclude_index=chosen_choice_index_to_index2[i],
                    return_index=True)
        sampling_prob = take(prob, sampled_index)
        sampled_index_within_prob = sampled_index.copy()
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(
                prob, chosen_choice_index_to_index2[:, newaxis])
            ## if chosen choice chosen equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        if local_resources.get("include_mnl_bias_correction_term", False):
            if include_chosen_choice:
                sampled_index_within_prob = column_stack(
                    (chosen_choice_index_to_index2[:, newaxis],
                     sampled_index_within_prob))
            interaction_dataset.add_mnl_bias_correction_term(
                prob, sampled_index_within_prob)

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
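The include_chosen_choice handling above prepends each agent's current choice as column 0 and flags it; a stripped-down sketch with made-up indices, assuming UNPLACED_ID is -1:

# Sketch of prepending the chosen alternative as column 0 (made-up data; UNPLACED_ID assumed to be -1).
import numpy as np

UNPLACED_ID = -1
sampled_index = np.array([[3, 7], [2, 9], [5, 4]])      # J=2 sampled alternatives per agent
chosen_choice_index = np.array([6, -1, 4])              # agent 1 has no valid current choice

sampled_index = np.column_stack((chosen_choice_index[:, np.newaxis], sampled_index))
is_chosen_choice = np.zeros(sampled_index.shape, dtype="bool")
is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

print(sampled_index)       # chosen alternative (or -1) is now the first column
print(is_chosen_choice)    # first column True only where a valid current choice existed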
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=False, with_replacement=False, resources=None, dataset_pool=None):
        
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1":index1, "index2": index2,
                "sample_size": sample_size, "weight": weight,
                "with_replacement": with_replacement,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())
            
        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None
        
        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
            
        with_replacement = local_resources.get("with_replacement")
            
        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight=choice.get_attribute(weight)
                rank_of_weight = 1 
            elif VariableName(weight).get_dataset_name() == choice.get_dataset_name():
                weight=choice.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 1
            else:
                ## weights can be an interaction variable
                interaction_dataset = InteractionDataset(local_resources)
                weight=interaction_dataset.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 2
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        if rank_of_weight == 1: # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement           # sampling with no replacement 
            if nonzerocounts(weight) < J:
                logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                replace = True
            sampled_index = prob2dsample( index2, sample_size=(index1.size, J),
                                        prob_array=prob, exclude_index=chosen_choice_index_to_index2,
                                        replace=replace, return_index=True )
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size,J), dtype="int32") - 1
                
            for i in range(index1.size):
                replace = with_replacement          # sampling with/without replacement
                i_prob = prob[i,:]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i,:] = probsample_noreplace( index2, sample_size=J, prob_array=i_prob,
                                                     exclude_index=chosen_choice_index_to_index2[i],
                                                     return_index=True )
        sampling_prob = take(prob, sampled_index)
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack((chosen_choice_index[:,newaxis],sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(prob, chosen_choice_index_to_index2[:, newaxis])
            ## if chosen choice chosen equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index==UNPLACED_ID)[0],] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])
        
        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        
        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset
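The replacement fallback above, switching to sampling with replacement whenever there are fewer non-zero weights than requested draws, in plain numpy with made-up weights; numpy.random.choice stands in for prob2dsample/probsample_noreplace.

# Sketch of the replacement fallback (made-up weights).
import numpy as np

weight = np.array([0.0, 0.4, 0.0, 0.6, 0.0])     # only two alternatives have non-zero weight
J = 3                                            # but three draws are requested
replace = False
if np.count_nonzero(weight) < J:                 # mirrors nonzerocounts(weight) < J
    replace = True                               # fall back to sampling with replacement

prob = weight / weight.sum()
sampled = np.random.choice(np.arange(weight.size), size=J, replace=replace, p=prob)
print(sampled)                                   # draws only from indices {1, 3}, repeats allowed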
Example No. 11
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            stratum=None,
            weight=None,
            sample_size=1,
            sample_size_from_each_stratum=None,
            sample_size_from_chosen_stratum=None,
            sample_rate=None,
            include_chosen_choice=False,
            resources=None,
            with_replacement=False,
            dataset_pool=None,
            **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "with_replacement": with_replacement,
            "stratum": stratum,
            "weight": weight,
            "sample_size": sample_size,
            "sample_size_from_each_stratum": sample_size_from_each_stratum,
            "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
            "sample_rate": sample_rate,
            "include_chosen_choice": include_chosen_choice
        })

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight, resources=local_resources)
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                weight = take(weight, index2)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError("'stratum' must be defined for stratified sampling.")
        if isinstance(stratum, str):
            choice.compute_variables(stratum, resources=local_resources)
            stratum = choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size,
                              dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(
            chosen_choice_index != -1)] = stratum[chosen_choice_index[where(
                chosen_choice_index != -1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata != NO_STRATUM_ID)]

        #        if rank_of_weight == 2:
        #            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

        #        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get(
            "sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size,
                                  dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise NotImplementedError("sample_rate is not implemented yet.")
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get(
            "sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(
                map(lambda x, y: [x, y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size, unique_strata.size, 2),
                                          dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True,
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[where(
                        unique_strata == chosen_stratum[i])] += -1
                else:
                    agents_strata_sample_size[where(
                        unique_strata ==
                        chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(
                    map(lambda x, y: [x, y], unique_strata,
                        agents_strata_sample_size))
                strata_sample_setting[i, ...] = strata_sample_pairs

            sampled_index = self._sample_by_agent_and_stratum(
                index1, index2, selectable_strata, prob,
                chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate(
                (chosen_choice_index[:, newaxis], sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0 #make chosen_choice index to sampled_index, instead of choice (as chosen_choice_index does)
            #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
            #for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            chosen_probability = zeros(
                (chosen_choice_index.size, ), dtype=float32) - 1
            for stratum in unique_strata:
                w = chosen_stratum == stratum
                chosen_probability[w] = (
                    prob[chosen_choice_index[w]] /
                    prob[selectable_strata == stratum].sum()).astype(float32)
            self._sampling_probability = concatenate(
                (chosen_probability[:, newaxis], self._sampling_probability),
                axis=1)
            self._stratum_id = concatenate(
                (chosen_stratum[:, newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
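A compact sketch of how the per-agent strata_sample_setting above is assembled, with made-up strata and sizes: each agent gets (stratum, size) pairs, with one fewer draw taken from its chosen stratum when the chosen choice will be prepended.

# Sketch of building per-agent (stratum, sample_size) pairs (made-up values).
import numpy as np

unique_strata = np.array([1, 2, 3])
strata_sample_size = np.array([2, 2, 2])       # default number of draws per stratum
chosen_stratum = np.array([2, 3])              # chosen stratum of two agents

n_agents = chosen_stratum.size
strata_sample_setting = np.zeros((n_agents, unique_strata.size, 2), dtype="int32")
for i in range(n_agents):
    sizes = strata_sample_size.copy()
    sizes[unique_strata == chosen_stratum[i]] -= 1   # the chosen choice occupies one slot in its stratum
    strata_sample_setting[i] = np.column_stack((unique_strata, sizes))

print(strata_sample_setting[0])   # [[1 2] [2 1] [3 2]] for an agent whose chosen stratum is 2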
Example No. 12
    def run(self, dataset1, dataset2, index1=None, index2=None, stratum=None, weight=None,
            sample_size=1, sample_size_from_each_stratum=None, sample_size_from_chosen_stratum=None, sample_rate=None,
            include_chosen_choice=False, resources=None, with_replacement=False, dataset_pool=None, **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
                        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                 "index1": index1, "index2": index2,
                 "with_replacement": with_replacement,
                 "stratum": stratum, "weight": weight,
                 "sample_size": sample_size,
                 "sample_size_from_each_stratum": sample_size_from_each_stratum,
                 "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
                 "sample_rate": sample_rate,
                 "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight,
                resources = local_resources )
            weight=choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                weight = take(weight, index2)
            else:
                err_msg = "weight array size doesn't match the size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)
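        ## normalize() is assumed to rescale the weights so they sum to 1.0 over index2;
        ## the sampling helpers renormalize again within each stratum before drawing.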

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError("'stratum' must be defined for stratified sampling.")
        if isinstance(stratum, str):
            choice.compute_variables(stratum,
                resources = local_resources )
            stratum=choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
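        ## chosen_choice_index is an index into the full choice set; lookup() re-expresses
        ## it relative to index2, marking choices that are not in index2 with UNPLACED_ID.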
        
        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size, dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(chosen_choice_index!=-1)] = stratum[chosen_choice_index[where(chosen_choice_index!=-1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata!=NO_STRATUM_ID)]
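        ## selectable_strata gives the stratum of every alternative in index2;
        ## unique_strata lists the strata that can actually be sampled from
        ## (alternatives flagged NO_STRATUM_ID are excluded).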

#        if rank_of_weight == 2:
#            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

#        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get("sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size, dtype=DTYPE) * sample_size_from_each_stratum
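        ## By default the same number of alternatives is drawn from every stratum;
        ## strata_sample_size holds that per-stratum count, one entry per unique stratum.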
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise NotImplementedError("sample_rate is not implemented yet.")
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get("sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(index1, index2, selectable_strata, prob,
                                                        chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                                  chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size,unique_strata.size,2), dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True, 
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] -= 1
                else:
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, agents_strata_sample_size))
                strata_sample_setting[i,...] = strata_sample_pairs
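            ## strata_sample_setting is a per-agent array of (stratum, sample_size) pairs:
            ## when include_chosen_choice is True and no explicit size is given for the
            ## chosen stratum, one slot is reserved there for the agent's observed choice.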

            sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                              chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate((chosen_choice_index[:,newaxis],sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0 #make chosen_choice index to sampled_index, instead of choice (as chosen_choice_index does)
                                                                      #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
                                                                      #for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
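            ## The observed choice occupies column 0 of sampled_index, so is_chosen_choice
            ## flags that column for every agent with a valid chosen alternative; its
            ## sampling probability is computed next as the chosen weight divided by the
            ## total weight of the chosen stratum.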
            
            chosen_probability = zeros((chosen_choice_index.size,),dtype=float32) - 1
            for stratum in unique_strata:
                w = chosen_stratum==stratum
                chosen_probability[w] = (prob[chosen_choice_index[w]] / prob[selectable_strata==stratum].sum()).astype(float32)
            self._sampling_probability = concatenate((chosen_probability[:,newaxis], self._sampling_probability), axis=1)
            self._stratum_id = concatenate((chosen_stratum[:,newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset    
    def run(self, data, upc_sequence, resources=None):

        self.mnl_probabilities = upc_sequence.probability_class
        self.bhhh_estimation = bhhh_mnl_estimation()

        modified_upc_sequence = UPCFactory().get_model(
            utilities=None,
            probabilities="opus_core.mnl_probabilities",
            choices=None)
        modified_upc_sequence.utility_class = upc_sequence.utility_class

        N, neqs, V = data.shape
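        ## data is assumed to be the 3d estimation array: N observations, neqs sampled
        ## alternatives per observation, and V explanatory variables.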

        max_iter = resources.get("max_iterations", 100)  # default
        sc = SessionConfiguration()
        dataset_pool = sc.get_dataset_pool()
        sample_rate = dataset_pool.get_dataset("sample_rate")

        CLOSE = sc["CLOSE"]
        info_filename = sc["info_file"]
        info_filename = os.path.join('.', info_filename)
        info_file = open(info_filename, "a")
        constraint_dict = {1: 'constrained', 0: 'unconstrained'}
        swing_cases_fix = 0  #set swing alternatives to constrained (1) or unconstrained (0)
        prob_correlation = None

        choice_set = resources['_model_'].choice_set
        J = choice_set.size()
        alt_id = choice_set.get_id_attribute()
        movers = choice_set.get_attribute('movers')

        resources.check_obligatory_keys(["capacity_string"])
        supply = choice_set.get_attribute(resources["capacity_string"])
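        ## supply holds the capacity of each alternative, read from the attribute
        ## named by resources["capacity_string"].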

        index = resources.get("index", None)
        if index is None:  # no sampling case, alternative set is the full choice_set
            index = arange(J)
        if index.ndim <= 1:
            index = repeat(index[newaxis, :], N, axis=0)
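        ## A 1d index (the same alternatives for every observation) is broadcast into a
        ## 2d (N x alternatives) array so the rest of the loop can treat both cases alike.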

        if resources.get('aggregate_to_dataset', None):
            aggregate_dataset = dataset_pool.get_dataset(
                resources.get('aggregate_to_dataset'))
            choice_set_aggregate_id = choice_set.get_attribute(
                aggregate_dataset.get_id_name()[0])
            index = aggregate_dataset.get_id_index(
                choice_set_aggregate_id[index].ravel()).reshape(index.shape)

            supply = aggregate_dataset.get_attribute(
                resources["capacity_string"])
            J = aggregate_dataset.size()

            movers = aggregate_dataset.get_attribute("movers")

        demand_history = movers[:, newaxis]
        resources.merge({"index": index})

        pi = ones(index.shape, dtype=float32)  #initialize pi
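        ## pi is the constraint adjustment factor (entering the utilities as ln(pi) in
        ## later iterations); it is initialized to 1, i.e. an unconstrained starting point.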
        #average_omega = ones(J,dtype=float32)  #initialize average_omega
        logger.start_block('Outer Loop')
        for i in range(max_iter):
            logger.log_status('Outer Loop Iteration %s' % i)

            result = self.bhhh_estimation.run(data, modified_upc_sequence,
                                              resources)
            del self.bhhh_estimation
            collect()
            self.bhhh_estimation = bhhh_mnl_estimation()

            probability = modified_upc_sequence.get_probabilities()
            if data.shape[2] == V:  #insert a placeholder for ln(pi) in data
                data = concatenate((data, ones((N, neqs, 1), dtype=float32)),
                                   axis=2)
                coef_names = resources.get("coefficient_names")
                coef_names = concatenate((coef_names, array(["ln_pi"])))
                resources.merge({"coefficient_names": coef_names})
            else:
                beta_ln_pi = result['estimators'][where(
                    coef_names == 'ln_pi')][0]
                logger.log_status("mu = 1/%s = %s" %
                                  (beta_ln_pi, 1 / beta_ln_pi))

                prob_hat = safe_array_divide(probability, pi**beta_ln_pi)
                #prob_hat = safe_array_divide(probability, pi)
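                ## prob_hat backs the ln(pi) term out of the fitted probabilities:
                ## prob_hat_ij = probability_ij / pi_ij**beta_ln_pi, which should again
                ## sum to 1 over alternatives for each observation if the model is consistent.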
                prob_hat_sum = prob_hat.sum(axis=1, dtype=float32)
                if not ma.allclose(prob_hat_sum, 1.0):
                    logger.log_status(
                        "probability doesn't sum up to 1, with minimum %s, and maximum %s"
                        % (prob_hat_sum.min(), prob_hat_sum.max()))

                    probability = normalize(prob_hat)

            demand = self.mnl_probabilities.get_demand(index, probability,
                                                       J) * 1 / sample_rate
            demand_history = concatenate((demand_history, demand[:, newaxis]),
                                         axis=1)

            sdratio = safe_array_divide(
                supply, demand, return_value_if_denominator_is_zero=2.0)
            sdratio_matrix = sdratio[index]
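            ## sdratio is the supply/demand ratio per alternative (capped at 2.0 where
            ## demand is zero); sdratio_matrix expands it to the per-observation
            ## alternative index so it can feed the inner constrained loop.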
            ## debug info
            from numpy import histogram
            from opus_core.misc import unique
            cc = histogram(index.ravel(), unique(index.ravel()))[0]
            logger.log_status(
                "================================================================="
            )
            logger.log_status("Probability min: %s, max: %s" %
                              (probability.min(), probability.max()))
            logger.log_status("Demand min: %s, max: %s" %
                              (demand.min(), demand.max()))
            logger.log_status("sdratio min: %s, max: %s" %
                              (sdratio.min(), sdratio.max()))
            logger.log_status("demand[sdratio==sdratio.min()]=%s" %
                              demand[sdratio == sdratio.min()])
            logger.log_status("demand[sdratio==sdratio.max()]=%s" %
                              demand[sdratio == sdratio.max()])
            logger.log_status(
                "Counts of unique submarkets in alternatives min: %s, max: %s"
                % (cc.min(), cc.max()))
            logger.log_status(
                "================================================================="
            )

            constrained_locations_matrix, omega, info = self.inner_loop(
                supply,
                demand,
                probability,
                index,
                sdratio_matrix,
                J,
                max_iteration=max_iter)

            inner_iterations, constrained_locations_history, swing_index, average_omega_history = info

            for idx in swing_index:
                logger.log_status(
                    "swinging alt with id %s set to %s" %
                    (alt_id[idx], constraint_dict[swing_cases_fix]))
                constrained_locations_matrix[index == idx] = swing_cases_fix

            if swing_index.size > 0:
                info_file.write("swing of constraints found with id %s \n" %
                                alt_id[swing_index])
                info_file.write("outer_iteration, %i, " % i +
                                ", ".join([str(i)] *
                                          (len(inner_iterations))) + "\n")
                info_file.write("inner_iteration, , " +
                                ", ".join(inner_iterations) + "\n")
                info_file.write("id, sdratio, " +
                                ", ".join(["avg_omega"] *
                                          len(inner_iterations)) + "\n")
                for idx in swing_index:
                    line = str(alt_id[idx]) + ','
                    line += str(sdratio[idx]) + ','
                    line += ",".join(
                        [str(x) for x in average_omega_history[idx, ]])
                    line += "\n"
                    info_file.write(line)

                info_file.write("\n")
                info_file.flush()

            outer_iterations = [str(i)] * len(inner_iterations)
            prob_min = [str(probability.min())] * len(inner_iterations)
            prob_max = [str(probability.max())] * len(inner_iterations)

            pi_new = self.mnl_probabilities.get_pi(
                sdratio_matrix, omega, constrained_locations_matrix)

            data[:, :, -1] = ln(pi_new)
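            ## The last column of data is the ln(pi) variable; refreshing it with the new
            ## constraint factors lets the next outer-loop estimation see the updated pi.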
            #diagnostic output

            if not ma.allclose(pi, pi_new, atol=CLOSE):
                if i > 0:  #don't print this for the first iteration
                    logger.log_status("min of abs(pi(l+1) - pi(l)): %s" %
                                      absolute(pi_new - pi).min())
                    logger.log_status("max of abs(pi(l+1) - pi(l)): %s" %
                                      absolute(pi_new - pi).max())
                    logger.log_status("mean of pi(l+1) - pi(l): %s" %
                                      (pi_new - pi).mean())
                    logger.log_status(
                        'Standard Deviation pi(l+1) - pi(l): %s' %
                        standard_deviation(pi_new - pi))
                    logger.log_status('correlation of pi(l+1) and pi(l): %s' %
                                      corr(pi_new.ravel(), pi.ravel())[0, 1])

                pi = pi_new
                probability_old = probability  # keep probability of the previous loop, for statistics computation only
            else:  #convergence criterion achieved, quitting outer loop
                logger.log_status(
                    "pi(l) == pi(l+1): Convergence criterion achieved")

                info_file.write("\nConstrained Locations History:\n")
                info_file.write("outer_iteration," +
                                ",".join(outer_iterations) + "\n")
                info_file.write("inner_iteration," +
                                ",".join(inner_iterations) + "\n")
                info_file.write("minimum_probability," + ",".join(prob_min) +
                                "\n")
                info_file.write("maximum_probability," + ",".join(prob_max) +
                                "\n")
                for row in range(J):
                    line = [
                        str(x) for x in constrained_locations_history[row, ]
                    ]
                    info_file.write(
                        str(alt_id[row]) + "," + ",".join(line) + "\n")

                info_file.flush()

                info_file.write("\nDemand History:\n")
                i_str = [str(x) for x in range(i)]
                info_file.write("outer_iteration, (movers)," +
                                ",".join(i_str) + "\n")
                #info_file.write(", ,\n")
                for row in range(J):
                    line = [str(x) for x in demand_history[row, ]]
                    info_file.write(
                        str(alt_id[row]) + "," + ",".join(line) + "\n")

                demand_history_info_criteria = [500, 100, 50, 20]
                for criterion in demand_history_info_criteria:
                    com_rows_index = where(movers <= criterion)[0]
                    info_file.write(
                        "\nDemand History for alternatives with less than or equal to %s movers in 1998:\n"
                        % criterion)
                    i_str = [str(x) for x in range(i)]
                    info_file.write("outer_iteration, (movers)," +
                                    ",".join(i_str) + "\n")
                    #info_file.write(", movers,\n")
                    for row in com_rows_index:
                        line = [str(x) for x in demand_history[row, ]]
                        info_file.write(
                            str(alt_id[row]) + "," + ",".join(line) + "\n")

                #import pdb; pdb.set_trace()
                #export prob correlation history
                correlation_indices, prob_correlation = self.compute_prob_correlation(
                    probability_old, probability, prob_hat, index, resources)

                info_file.write("\nCorrelation of Probabilities:\n")
                c_name = [
                    'corr(p_ij p~_ij)', 'corr(p_ij p^_ij)', 'corr(p_ij dummy)',
                    'corr(p~_ij p^_ij)', 'corr(p~_ij dummy)',
                    'corr(p^_ij dummy)'
                ]

                info_file.write("com_id, " + ",".join(c_name) + "\n")

                #info_file.write(", ,\n")
                for row in range(correlation_indices.size):
                    line = [str(x) for x in prob_correlation[row, ]]
                    info_file.write(
                        str(alt_id[correlation_indices[row]]) + "," +
                        ",".join(line) + "\n")

                info_file.close()

                result['pi'] = pi
                return result

        logger.end_block()
        try:
            info_file.close()
        except:
            pass

        raise RuntimeError("maximum number of iterations reached without convergence.")