Example #1
    def _sample_by_agent_and_stratum(self, index1, index2, stratum, prob_array,
                                     chosen_choice_index,
                                     strata_sample_setting):
        """agent by agent and stratum by stratum stratified sampling, suitable for 2d prob_array and/or sample_size varies for agents
        this method is slower than _sample_by_stratum, for simpler stratified sampling use _sample_by_stratum instead"""

        rank_of_prob = rank(prob_array)
        rank_of_strata = rank(strata_sample_setting)

        J = self.__determine_sampled_index_size(strata_sample_setting,
                                                rank_of_strata)
        sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1
        self._sampling_probability = zeros((index1.size, J), dtype=float32)
        self._stratum_id = ones((index1.size, J), dtype=DTYPE) * NO_STRATUM_ID

        for i in range(index1.size):
            if rank_of_strata == 3:
                strata_sample_pairs = strata_sample_setting[i, :]
            else:
                strata_sample_pairs = strata_sample_setting

            if rank_of_prob == 2:
                prob = prob_array[i, :]
            else:
                prob = prob_array

            j = 0
            for (this_stratum, this_size) in strata_sample_pairs:
                if this_size <= 0: continue
                index_not_in_stratum = where(stratum != this_stratum)[0]
                this_prob = copy.copy(prob)

                this_prob[index_not_in_stratum] = 0.0
                this_prob = normalize(this_prob)

                if nonzerocounts(this_prob) < this_size:
                    logger.log_warning(
                        "weight array doesn't have enough non-zero counts, use sample with replacement"
                    )

                # chosen_index_to_index2 = where(index2 == chosen_choice_index[i])[0]
                # exclude_index passed to probsample_noreplace needs to be indexed to index2
                this_sampled_index = probsample_noreplace(
                    index2,
                    sample_size=this_size,
                    prob_array=this_prob,
                    exclude_index=chosen_choice_index[i],
                    return_index=True)
                sampled_index[i, j:j + this_size] = this_sampled_index

                self._sampling_probability[
                    i, j:j + this_size] = this_prob[this_sampled_index]
                self._stratum_id[i, j:j + this_size] = ones(
                    (this_sampled_index.size, ), dtype=DTYPE) * this_stratum

                j += this_size

        return index2[sampled_index]
Example #2
    def _sample_by_agent_and_stratum(
        self, index1, index2, stratum, prob_array, chosen_choice_index, strata_sample_setting
    ):
        """agent by agent and stratum by stratum stratified sampling, suitable for 2d prob_array and/or sample_size varies for agents
        this method is slower than _sample_by_stratum, for simpler stratified sampling use _sample_by_stratum instead"""

        rank_of_prob = rank(prob_array)
        rank_of_strata = rank(strata_sample_setting)

        J = self.__determine_sampled_index_size(strata_sample_setting, rank_of_strata)
        sampled_index = zeros((index1.size, J), dtype="int32") - 1
        self._sampling_probability = zeros((index1.size, J), dtype=float32)
        self._stratum_id = ones((index1.size, J), dtype="int32") * NO_STRATUM_ID

        for i in range(index1.size):
            if rank_of_strata == 3:
                strata_sample_pairs = strata_sample_setting[i, :]
            else:
                strata_sample_pairs = strata_sample_setting

            if rank_of_prob == 2:
                prob = prob_array[i, :]
            else:
                prob = prob_array

            j = 0
            for (this_stratum, this_size) in strata_sample_pairs:
                if this_size <= 0:
                    continue
                index_not_in_stratum = where(stratum != this_stratum)[0]
                this_prob = copy.copy(prob)

                this_prob[index_not_in_stratum] = 0.0
                this_prob = normalize(this_prob)

                if nonzerocounts(this_prob) < this_size:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")

                #                chosen_index_to_index2 = where(index2 == chosen_choice_index[i])[0]
                # exclude_index passed to probsample_noreplace needs to be indexed to index2
                this_sampled_index = probsample_noreplace(
                    index2,
                    sample_size=this_size,
                    prob_array=this_prob,
                    exclude_index=chosen_choice_index[i],
                    return_index=True,
                )
                sampled_index[i, j : j + this_size] = this_sampled_index

                self._sampling_probability[i, j : j + this_size] = this_prob[this_sampled_index]
                self._stratum_id[i, j : j + this_size] = ones((this_sampled_index.size,), dtype="int32") * this_stratum

                j += this_size

        return index2[sampled_index]
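Both versions of _sample_by_agent_and_stratum above share the same inner step: for each agent and each (stratum, size) pair, the weights of alternatives outside the stratum are zeroed, the remainder is renormalized, and a fixed number of alternatives is drawn without replacement, excluding the agent's chosen alternative. Below is a minimal self-contained NumPy sketch of that step; sample_one_stratum and its arguments are illustrative names, not part of the opus API, and plain numpy.random is used in place of probsample_noreplace.

import numpy as np

def sample_one_stratum(prob, stratum, this_stratum, this_size, exclude=None, rng=None):
    """Draw this_size alternatives from one stratum, proportionally to prob,
    without replacement; prob and stratum are aligned 1d arrays."""
    rng = rng or np.random.default_rng()
    this_prob = prob.copy()
    this_prob[stratum != this_stratum] = 0.0   # mask alternatives outside the stratum
    if exclude is not None:
        this_prob[exclude] = 0.0               # e.g. the agent's already-chosen alternative
    if np.count_nonzero(this_prob) < this_size:
        raise ValueError("not enough non-zero weights in stratum %s" % this_stratum)
    this_prob = this_prob / this_prob.sum()    # renormalize within the stratum
    return rng.choice(prob.size, size=this_size, replace=False, p=this_prob)

# toy data: 8 alternatives split over two strata, uniform weights
stratum = np.array([1, 1, 1, 1, 2, 2, 2, 2])
prob = np.ones(8) / 8.0
print(sample_one_stratum(prob, stratum, this_stratum=2, this_size=2, exclude=5))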
Example #3
    def _sample_by_stratum(self, index1, index2, stratum, prob_array,
                           chosen_choice_index, strata_sample_setting):
        """stratum by stratum stratified sampling, suitable for 1d prob_array and sample_size is the same for all agents"""
        if prob_array.ndim != 1:
            raise RuntimeError("_sample_by_stratum only suitable for 1d prob_array")

        sampled_index = zeros((index1.size, 1), dtype=DTYPE) - 1
        self._sampling_probability = zeros((index1.size, 1), dtype=float32)
        self._stratum_id = ones((index1.size, 1), dtype=DTYPE) * NO_STRATUM_ID
        for this_stratum, this_size in strata_sample_setting:
            index_not_in_stratum = where(stratum != this_stratum)[0]
            this_prob = copy.copy(prob_array)

            this_prob[index_not_in_stratum] = 0.0
            this_prob = normalize(this_prob)

            replace = False  # non-repeat sampling
            if nonzerocounts(this_prob) < this_size:
                logger.log_warning(
                    "weight array doesn't have enough non-zero counts, sample with replacement"
                )
                replace = True

            this_sampled_index = prob2dsample(
                index2,
                sample_size=(index1.size, this_size),
                prob_array=this_prob,
                exclude_index=chosen_choice_index,
                replace=replace,
                return_index=True)
            sampled_index = concatenate((sampled_index, this_sampled_index),
                                        axis=1)
            self._sampling_probability = concatenate(
                (self._sampling_probability, this_prob[this_sampled_index]),
                axis=1)
            self._stratum_id = concatenate(
                (self._stratum_id,
                 ones((this_sampled_index.shape[0], 1), dtype=DTYPE) *
                 this_stratum),
                axis=1)

        self._sampling_probability = self._sampling_probability[:, 1:]
        self._stratum_id = self._stratum_id[:, 1:]
        return index2[sampled_index[:, 1:]]
Example #4
    def _sample_by_stratum(self, index1, index2, stratum, prob_array, chosen_choice_index, strata_sample_setting):
        """stratum by stratum stratified sampling, suitable for 1d prob_array and sample_size is the same for all agents"""
        if prob_array.ndim != 1:
            raise RuntimeError("_sample_by_stratum only suitable for 1d prob_array")

        sampled_index = zeros((index1.size, 1), dtype="int32") - 1
        self._sampling_probability = zeros((index1.size, 1), dtype=float32)
        self._stratum_id = ones((index1.size, 1), dtype="int32") * NO_STRATUM_ID
        for this_stratum, this_size in strata_sample_setting:
            index_not_in_stratum = where(stratum != this_stratum)[0]
            this_prob = copy.copy(prob_array)

            this_prob[index_not_in_stratum] = 0.0
            this_prob = normalize(this_prob)

            replace = False  # non-repeat sampling
            if nonzerocounts(this_prob) < this_size:
                logger.log_warning("weight array dosen't have enough non-zero counts, sample with replacement")
                replace = True

            this_sampled_index = prob2dsample(
                index2,
                sample_size=(index1.size, this_size),
                prob_array=this_prob,
                exclude_index=chosen_choice_index,
                replace=replace,
                return_index=True,
            )
            sampled_index = concatenate((sampled_index, this_sampled_index), axis=1)
            self._sampling_probability = concatenate(
                (self._sampling_probability, this_prob[this_sampled_index]), axis=1
            )
            self._stratum_id = concatenate(
                (self._stratum_id, ones((this_sampled_index.shape[0], 1), dtype="int32") * this_stratum), axis=1
            )

        self._sampling_probability = self._sampling_probability[:, 1:]
        self._stratum_id = self._stratum_id[:, 1:]
        return index2[sampled_index[:, 1:]]
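Both versions of _sample_by_stratum above build their outputs with the same accumulation pattern: each array is seeded with a dummy first column, one block of columns per stratum is concatenated along axis=1, and the dummy column is sliced off at the end. The following NumPy sketch shows only that pattern; draw_columns is an illustrative stand-in for prob2dsample, not the real sampler.

import numpy as np

def draw_columns(n_agents, size, rng):
    """Stand-in for prob2dsample: one row of `size` draws per agent."""
    return rng.integers(0, 100, size=(n_agents, size))

rng = np.random.default_rng(0)
n_agents = 4
strata_sample_setting = [(1, 2), (2, 3)]                 # (stratum_id, sample_size) pairs

sampled = np.zeros((n_agents, 1), dtype="int32") - 1     # dummy seed column of -1
stratum_id = np.zeros((n_agents, 1), dtype="int32")
for this_stratum, this_size in strata_sample_setting:
    block = draw_columns(n_agents, this_size, rng)
    sampled = np.concatenate((sampled, block), axis=1)
    stratum_id = np.concatenate(
        (stratum_id, np.full((n_agents, this_size), this_stratum, dtype="int32")), axis=1)

sampled, stratum_id = sampled[:, 1:], stratum_id[:, 1:]  # drop the dummy column
print(sampled.shape)   # (4, 5): 2 columns for stratum 1 plus 3 for stratum 2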
Example #5
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=False,
            with_replacement=False,
            resources=None,
            dataset_pool=None):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "sample_size": sample_size,
            "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice
        })

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1

        with_replacement = local_resources.get("with_replacement")

        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            else:
                varname = VariableName(weight)
                if varname.get_dataset_name() == choice.get_dataset_name():
                    weight = choice.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 1
                elif varname.get_interaction_set_names() is not None:
                    ## weights can be an interaction variable
                    interaction_dataset = InteractionDataset(local_resources)
                    weight = interaction_dataset.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 2
                    assert (len(weight.shape) >= rank_of_weight)
                else:
                    err_msg = ("weight is neither a known attribute name "
                               "nor a simple variable from the choice dataset "
                               "nor an interaction variable: '%s'" % weight)
                    logger.log_error(err_msg)
                    raise ValueError(err_msg)
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        if rank_of_weight == 1:  # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement  # sampling with/without replacement
            non_zero_counts = nonzerocounts(weight)
            if non_zero_counts < J:
                logger.log_warning(
                    "weight array doesn't have enough non-zero counts, use sample with replacement"
                )
                replace = True
            if non_zero_counts > 0:
                sampled_index = prob2dsample(
                    index2,
                    sample_size=(index1.size, J),
                    prob_array=prob,
                    exclude_index=chosen_choice_index_to_index2,
                    replace=replace,
                    return_index=True)
            else:
                # all alternatives have a zero weight
                sampled_index = zeros((index1.size, 0), dtype=DTYPE)
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1

            for i in range(index1.size):
                replace = with_replacement  # sampling with/without replacement
                i_prob = prob[i, :]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning(
                        "weight array doesn't have enough non-zero counts, use sample with replacement"
                    )
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i, :] = probsample_noreplace(
                    index2,
                    sample_size=J,
                    prob_array=i_prob,
                    exclude_index=chosen_choice_index_to_index2[i],
                    return_index=True)
        sampling_prob = take(prob, sampled_index)
        sampled_index_within_prob = sampled_index.copy()
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(
                prob, chosen_choice_index_to_index2[:, newaxis])
            ## if chosen choice chosen equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        if local_resources.get("include_mnl_bias_correction_term", False):
            if include_chosen_choice:
                sampled_index_within_prob = column_stack(
                    (chosen_choice_index_to_index2[:, newaxis],
                     sampled_index_within_prob))
            interaction_dataset.add_mnl_bias_correction_term(
                prob, sampled_index_within_prob)

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
Example #6
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=False, with_replacement=False, resources=None, dataset_pool=None):
        
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1":index1, "index2": index2,
                "sample_size": sample_size, "weight": weight,
                "with_replacement": with_replacement,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())
            
        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None
        
        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
            
        with_replacement = local_resources.get("with_replacement")
            
        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight=choice.get_attribute(weight)
                rank_of_weight = 1 
            elif VariableName(weight).get_dataset_name() == choice.get_dataset_name():
                weight=choice.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 1
            else:
                ## weights can be an interaction variable
                interaction_dataset = InteractionDataset(local_resources)
                weight=interaction_dataset.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 2
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        if rank_of_weight == 1: # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement           # sampling with/without replacement
            if nonzerocounts(weight) < J:
                logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                replace = True
            sampled_index = prob2dsample( index2, sample_size=(index1.size, J),
                                        prob_array=prob, exclude_index=chosen_choice_index_to_index2,
                                        replace=replace, return_index=True )
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size,J), dtype="int32") - 1
                
            for i in range(index1.size):
                replace = with_replacement          # sampling with/without replacement
                i_prob = prob[i,:]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i,:] = probsample_noreplace( index2, sample_size=J, prob_array=i_prob,
                                                     exclude_index=chosen_choice_index_to_index2[i],
                                                     return_index=True )
        sampling_prob = take(prob, sampled_index)
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack((chosen_choice_index[:,newaxis],sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(prob, chosen_choice_index_to_index2[:, newaxis])
            ## if chosen choice chosen equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index==UNPLACED_ID)[0],] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])
        
        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        
        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset
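When include_chosen_choice is true, both versions of run() prepend the agent's chosen alternative as column 0 of the sampled index array, mark that column in a boolean chosen_choice matrix, and force the sampling probability of the chosen column to zero for agents whose chosen choice equals UNPLACED_ID. A minimal NumPy sketch of that bookkeeping is below; the toy arrays and the UNPLACED_ID = -1 sentinel are assumptions for illustration only.

import numpy as np

UNPLACED_ID = -1  # assumed sentinel for agents without a valid chosen alternative

# toy data: 3 agents, 2 sampled alternatives each (indices into the choice set)
sampled_index = np.array([[4, 7], [2, 9], [5, 1]])
chosen_choice_index = np.array([3, UNPLACED_ID, 8])      # agent 1 has no valid choice
sampling_prob = np.full(sampled_index.shape, 0.1)
prob_of_chosen = np.array([0.2, 0.0, 0.3])               # prob looked up for the chosen column

# prepend the chosen alternative as column 0, as run() does with column_stack
sampled_index = np.column_stack((chosen_choice_index[:, np.newaxis], sampled_index))

# mark column 0 as the chosen choice only where the agent actually has one
is_chosen_choice = np.zeros(sampled_index.shape, dtype=bool)
is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = True

# the sampling probability of a missing chosen choice is forced to zero
prob_of_chosen[chosen_choice_index == UNPLACED_ID] = 0.0
sampling_prob = np.column_stack((prob_of_chosen, sampling_prob))

print(sampled_index)
print(is_chosen_choice)
print(sampling_prob)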