예제 #1
0
    def _calculate_fpc(
        self,
        samp_unit: np.ndarray,
        samp_size: Union[Dict[Any, int], int],
        stratum: np.ndarray,
    ) -> None:
        """Compute the finite population correction and store it in self.fpc.

        For every group the stored value is sqrt((N - n) / (N - 1)), where N
        is the number of sampling units in the group and n is the sample size
        requested for it.

        Args:
            samp_unit (np.ndarray): sampling units; passed through
                formats.sample_units(..., unique=True) before being counted.
            samp_size (Union[Dict[Any, int], int]): sample size(s); passed
                through formats.sample_size_dict to obtain a dictionary.
            stratum (np.ndarray): stratum label of each unit. Only used to
                group the units when self.stratification is truthy.

        Returns:
            None: the result is written to self.fpc, keyed by stratum label,
            or by "__none__" when the design is not stratified.
        """

        units = formats.sample_units(samp_unit, unique=True)
        sizes = formats.sample_size_dict(samp_size, self.stratification,
                                         stratum)

        def correction(population_count, sample_count):
            # Same formula for a single stratum or the whole population.
            return np.sqrt(
                (population_count - sample_count) / (population_count - 1))

        self.fpc = {}
        if not self.stratification:
            self.fpc["__none__"] = correction(units.size, sizes["__none__"])
            return

        for label in np.unique(stratum):
            units_in_stratum = len(units[stratum == label])
            self.fpc[label] = correction(units_in_stratum, sizes[label])
예제 #2
0
    def _srs_inclusion_probs(
        self,
        samp_unit: np.ndarray,
        samp_size: Union[Dict[Any, int], int],
        stratum: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        """Compute equal inclusion probabilities, overall or within strata.

        Each unit receives n / N, where n is the requested sample size and N
        the number of units, both taken within the unit's stratum when
        self.stratification is truthy and over all units otherwise.

        Args:
            samp_unit (np.ndarray): sampling units; passed through
                formats.sample_units before being counted.
            samp_size (Union[Dict[Any, int], int]): sample size(s); passed
                through formats.sample_size_dict to obtain a dictionary.
            stratum (Optional[np.ndarray], optional): stratum label of each
                unit. Defaults to None.

        Returns:
            np.ndarray: one inclusion probability per sampling unit.
        """

        units = formats.sample_units(samp_unit)
        sizes = formats.sample_size_dict(samp_size, self.stratification,
                                         stratum)

        total_units = units.size
        if not self.stratification:
            return np.ones(total_units) * sizes["__none__"] / total_units

        # Start from NaN so a unit that matches no stratum stays visible.
        probabilities = np.full(total_units, np.nan)
        for label in np.unique(stratum):
            members = stratum == label
            probabilities[members] = sizes[label] / units[members].size

        return probabilities
예제 #3
0
    def _pps_inclusion_probs(
        self,
        samp_unit: np.ndarray,
        samp_size: Dict[Any, int],
        mos: np.ndarray,
        stratum: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        """Compute inclusion probabilities proportional to the measure of size.

        Each unit receives n * mos / sum(mos), where n is the requested
        sample size and the sum runs over the unit's stratum when
        self.stratification is truthy and over all units otherwise.

        Args:
            samp_unit (np.ndarray): sampling units; passed through
                formats.sample_units(..., unique=True).
            samp_size (Dict[Any, int]): sample size(s); passed through
                formats.sample_size_dict to obtain a dictionary.
            mos (np.ndarray): measure of size of each unit.
            stratum (Optional[np.ndarray], optional): stratum label of each
                unit. Defaults to None.

        Returns:
            np.ndarray: one inclusion probability per sampling unit.
        """

        units = formats.sample_units(samp_unit, unique=True)
        sizes = formats.sample_size_dict(samp_size, self.stratification,
                                         stratum)

        if not self.stratification:
            return sizes["__none__"] * mos / np.sum(mos)

        # Start from NaN so a unit that matches no stratum stays visible.
        probabilities = np.full(units.size, np.nan)
        for label in np.unique(stratum):
            members = stratum == label
            stratum_mos = mos[members]
            probabilities[members] = (sizes[label] * stratum_mos /
                                      np.sum(stratum_mos))

        return probabilities
예제 #4
0
    def select(
        self,
        samp_unit: Array,
        samp_size: Union[Dict[Any, int], int, None] = None,
        stratum: Optional[Array] = None,
        mos: Optional[Array] = None,
        samp_rate: Union[Dict[Any, float], float, None] = None,
        probs: Optional[Array] = None,
        shuffle: bool = False,
        to_dataframe: bool = False,
        sample_only: bool = False,
    ) -> Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Selects the random sample.

        Args:
            samp_unit (Array): an array of all the observations in the target
                population.
            samp_size (Union[Dict[Any, int], int, None], optional): the sample
                size, or the dictionary of sample sizes by stratum, if
                applicable. Defaults to None.
            stratum (Optional[Array], optional): array of the strata
                associated to the population units. Defaults to None.
            mos (Optional[Array], optional): array of the measure of size
                associated to the population units. Defaults to None.
            samp_rate (Union[Dict[Any, float], float, None], optional):
                sampling rate, provided by stratum if applicable. Defaults to
                None.
            probs (Optional[Array], optional): array of the probability of
                selection associated to the population units. It is passed to
                the "grs" selection; for "srs" and the PPS methods it is
                replaced by the computed inclusion probabilities. Defaults to
                None.
            shuffle (bool, optional): indicates whether to shuffle the data
                prior to running the selection algorithm. Only applied to
                the "sys" and "pps-sys" methods; the shuffle uses NumPy's
                global random state and the results are mapped back to the
                order of samp_unit. Defaults to False.
            to_dataframe (bool, optional): indicates whether to convert the
                output to a pandas dataframe. Defaults to False.
            sample_only (bool, optional): indicates whether to return only
                the sample without the out of sample units. Defaults to
                False.

        Raises:
            AssertionError: if both samp_size and samp_rate are provided as
                input.
            AssertionError: if some of the clusters are certainties (PPS
                methods only).
            AssertionError: if self.method is not one of the selection
                methods handled here.

        Returns:
            Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
            when sample_only or to_dataframe is True, the frame built by
            self._to_dataframe (restricted to the rows where "_sample" equals
            1 when sample_only is True); otherwise the tuple
            (sample, hits, probs) returned by the selection routine.
        """

        samp_unit = formats.sample_units(samp_unit, unique=True)

        if stratum is not None:
            stratum = formats.numpy_array(stratum)
        if mos is not None:
            mos = formats.numpy_array(mos)
        if probs is not None:
            probs = formats.numpy_array(probs)

        if samp_size is not None and samp_rate is not None:
            raise AssertionError(
                "Both samp_size and samp_rate are provided. Only one of the two parameters should be specified."
            )

        if samp_size is not None:
            samp_size = formats.sample_size_dict(samp_size,
                                                 self.stratification, stratum)
            samp_size = self._convert_to_dict(samp_size, int)
        if samp_rate is not None:
            samp_rate = self._convert_to_dict(samp_rate, float)

        # Remember the caller's ordering: it is what gets reported back.
        ordered_unit, ordered_stratum, ordered_mos = samp_unit, stratum, mos

        shuffled_order = None
        if shuffle and self.method in ("sys", "pps-sys"):
            # np.random.shuffle works in place and returns None, so it needs
            # a mutable index array rather than a range object.
            shuffled_order = np.arange(samp_unit.size)
            np.random.shuffle(shuffled_order)
            samp_unit = samp_unit[shuffled_order]
            if stratum is not None:
                stratum = stratum[shuffled_order]
            if self.method == "pps-sys" and mos is not None:
                mos = mos[shuffled_order]

        if self.method == "srs":
            probs = self._srs_inclusion_probs(samp_unit,
                                              samp_size,
                                              stratum=stratum)
            sample, hits = self._grs_select(probs, samp_unit, samp_size,
                                            stratum)
        elif self.method in (
                "pps-brewer",
                "pps-hv",
                "pps-murphy",
                "pps-rs",
                "pps-sys",
        ):
            if self._anycertainty(samp_size, stratum, mos):
                raise AssertionError("Some clusters are certainties.")
            probs = self.inclusion_probs(samp_unit, samp_size, stratum, mos)
            sample, hits = self._pps_select(samp_unit, samp_size, stratum, mos)
        elif self.method == "sys":
            sample, hits = self._sys_select(samp_unit, samp_size, stratum,
                                            samp_rate)
        elif self.method == "grs":
            sample, hits = self._grs_select(probs, samp_unit, samp_size,
                                            stratum)
        else:
            # Without this branch the code below fails on an unbound name.
            raise AssertionError(
                f"Unsupported selection method: {self.method!r}.")

        if shuffled_order is not None:
            # argsort of a permutation is its inverse: position i of the
            # restored arrays holds the result for original unit i.
            restore_order = np.argsort(shuffled_order)
            sample = sample[restore_order]
            hits = hits[restore_order]
            if self.method == "pps-sys":
                # These probabilities were computed on the shuffled arrays.
                probs = probs[restore_order]
            samp_unit, stratum, mos = ordered_unit, ordered_stratum, ordered_mos

        if sample_only or to_dataframe:
            frame = self._to_dataframe(samp_unit, stratum, mos, sample, hits,
                                       probs)
            if sample_only:
                return frame.loc[frame["_sample"] == 1]
            return frame

        return sample, hits, probs
예제 #5
0
    def inclusion_probs(
        self,
        samp_unit: Array,
        samp_size: Union[Dict[Any, int], int],
        stratum: Optional[Array] = None,
        mos: Optional[Array] = None,
        samp_rate: Union[Dict[Any, float], float, None] = None,
    ) -> np.ndarray:
        """Computes the inclusion probabilities according to the sampling scheme.

        Args:
            samp_unit (Array): an array of all the observations in the target
                population.
            samp_size (Union[Dict[Any, int], int]): the sample size, or the
                dictionary of sample sizes by stratum, if applicable.
            stratum (Optional[Array], optional): array of the strata
                associated to the population units. Defaults to None.
            mos (Optional[Array], optional): array of the measure of size
                associated to the population units. Defaults to None.
            samp_rate (Union[Dict[Any, float], float, None], optional):
                sampling rate, provided by stratum if applicable. Only
                forwarded to the "sys" computation. Defaults to None.

        Raises:
            AssertionError: if some of the clusters are certainties (PPS
                methods only).
            AssertionError: if self.method is not "srs", "sys" or one of the
                PPS methods.

        Returns:
            np.ndarray: an array of the probabilities of inclusion.
        """
        samp_unit = formats.sample_units(samp_unit, unique=True)

        if stratum is not None:
            stratum = formats.numpy_array(stratum)
        if mos is not None:
            mos = formats.numpy_array(mos)

        samp_size = formats.sample_size_dict(samp_size, self.stratification,
                                             stratum)

        if samp_size is not None:
            samp_size = self._convert_to_dict(samp_size, int)
        if samp_rate is not None:
            samp_rate = self._convert_to_dict(samp_rate, float)

        if self.method == "srs":
            return self._srs_inclusion_probs(samp_unit, samp_size, stratum)

        if self.method in (
                "pps-brewer",
                "pps-hv",
                "pps-murphy",
                "pps-rs",
                "pps-sys",
        ):
            if self._anycertainty(samp_size, stratum, mos):
                raise AssertionError("Some clusters are certainties.")
            return self._pps_inclusion_probs(samp_unit, samp_size, mos,
                                             stratum)

        if self.method == "sys":
            return self._sys_inclusion_probs(samp_unit, samp_size, stratum,
                                             samp_rate)

        # Previously this fell through to an unbound local variable.
        raise AssertionError(
            f"Inclusion probabilities are not available for the selection method {self.method!r}."
        )
예제 #6
0
    def _pps_select(
        self,
        samp_unit: np.ndarray,
        samp_size: Dict[Any, int],
        stratum: np.ndarray,
        mos: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Run the PPS selection routine that matches self.method.

        The routine is applied once to the whole population, or once per
        stratum when self.stratification is truthy, in which case the
        per-stratum results are written back at the positions of the
        stratum's units.

        Args:
            samp_unit (np.ndarray): sampling units; passed through
                formats.sample_units(..., unique=True).
            samp_size (Dict[Any, int]): sample size(s); passed through
                formats.sample_size_dict to obtain a dictionary.
            stratum (np.ndarray): stratum label of each unit.
            mos (np.ndarray): measure of size of each unit.

        Raises:
            AssertionError: if self.method is not one of the PPS methods.

        Returns:
            Tuple[np.ndarray, np.ndarray]: the (sample, hits) pair produced
            by the selection routine, as two separate arrays.
        """

        samp_unit = formats.sample_units(samp_unit, unique=True)
        samp_size = formats.sample_size_dict(samp_size, self.stratification,
                                             stratum)

        # Exact-match dispatch; the routines are looked up by name so only
        # the one actually needed has to exist on the instance.
        selector_names = {
            "pps-sys": "_pps_sys_select",  # systematic
            "pps-hv": "_pps_hv_select",  # Hanurav-Vijayan
            "pps-brewer": "_pps_brewer_select",
            "pps-murphy": "_pps_murphy_select",
            "pps-rs": "_pps_rs_select",
        }
        if self.method not in selector_names:
            raise AssertionError(
                f"Unsupported PPS selection method: {self.method!r}.")
        selector = getattr(self, selector_names[self.method])

        if not self.stratification:
            sample, hits = selector(samp_unit, samp_size["__none__"], mos)
            return sample, hits

        # Two distinct arrays: binding both names to one array would let the
        # hits overwrite the sample indicators.
        sample = np.zeros(samp_unit.size, dtype=int)
        hits = np.zeros(samp_unit.size, dtype=int)
        for s in np.unique(stratum):
            stratum_units = stratum == s
            sample[stratum_units], hits[stratum_units] = selector(
                samp_unit[stratum_units],
                samp_size[s],
                mos[stratum_units],
            )

        return sample, hits