def _calculate_fpc(
    self,
    samp_unit: np.ndarray,
    samp_size: Union[Dict[Any, int], int],
    stratum: np.ndarray,
) -> None:
    """Computes the finite population correction and stores it in ``self.fpc``.

    The correction is ``sqrt((N - n) / (N - 1))`` where ``N`` is the number
    of sampling units and ``n`` the sample size. With stratification one
    value is stored per stratum label; otherwise a single value is stored
    under the ``"__none__"`` key.

    Args:
        samp_unit (np.ndarray): the sampling units of the population.
        samp_size (Union[Dict[Any, int], int]): the sample size, overall or
            by stratum.
        stratum (np.ndarray): the stratum label of each sampling unit; only
            read when ``self.stratification`` is truthy.
    """
    units = formats.sample_units(samp_unit, unique=True)
    sizes = formats.sample_size_dict(samp_size, self.stratification, stratum)

    # Always start from an empty mapping so stale strata never survive.
    self.fpc = {}

    if not self.stratification:
        self.fpc["__none__"] = np.sqrt(
            (units.size - sizes["__none__"]) / (units.size - 1)
        )
        return

    for label in np.unique(stratum):
        units_in_stratum = len(units[stratum == label])
        self.fpc[label] = np.sqrt(
            (units_in_stratum - sizes[label]) / (units_in_stratum - 1)
        )
def _srs_inclusion_probs(
    self,
    samp_unit: np.ndarray,
    samp_size: Union[Dict[Any, int], int],
    stratum: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Computes the inclusion probabilities under simple random sampling.

    Every unit receives ``n / N``: sample size over number of units, taken
    within its stratum when ``self.stratification`` is truthy and over the
    whole population otherwise.

    Args:
        samp_unit (np.ndarray): the sampling units of the population.
        samp_size (Union[Dict[Any, int], int]): the sample size, overall or
            by stratum.
        stratum (Optional[np.ndarray], optional): the stratum label of each
            sampling unit. Defaults to None.

    Returns:
        np.ndarray: one inclusion probability per sampling unit.
    """
    units = formats.sample_units(samp_unit)
    sizes = formats.sample_size_dict(samp_size, self.stratification, stratum)
    total_units = units.size

    if not self.stratification:
        return np.ones(total_units) * sizes["__none__"] / total_units

    # Start from NaN so a unit that matches no stratum label stays visible.
    incl_probs = np.full(total_units, np.nan)
    for label in np.unique(stratum):
        in_stratum = stratum == label
        units_in_stratum = units[in_stratum].size
        incl_probs[in_stratum] = sizes[label] / units_in_stratum

    return incl_probs
def _pps_inclusion_probs(
    self,
    samp_unit: np.ndarray,
    samp_size: Dict[Any, int],
    mos: np.ndarray,
    stratum: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Computes the inclusion probabilities proportional to size.

    Every unit receives ``n * mos / sum(mos)``, where the sample size and
    the total measure of size are taken within the unit's stratum when
    ``self.stratification`` is truthy and over all units otherwise.

    Args:
        samp_unit (np.ndarray): the sampling units of the population.
        samp_size (Dict[Any, int]): the sample size, overall or by stratum.
        mos (np.ndarray): the measure of size of each sampling unit.
        stratum (Optional[np.ndarray], optional): the stratum label of each
            sampling unit. Defaults to None.

    Returns:
        np.ndarray: one inclusion probability per sampling unit.
    """
    units = formats.sample_units(samp_unit, unique=True)
    sizes = formats.sample_size_dict(samp_size, self.stratification, stratum)

    if not self.stratification:
        return sizes["__none__"] * mos / np.sum(mos)

    # Start from NaN so a unit that matches no stratum label stays visible.
    incl_probs = np.full(units.size, np.nan)
    for label in np.unique(stratum):
        in_stratum = stratum == label
        mos_in_stratum = mos[in_stratum]
        incl_probs[in_stratum] = (
            sizes[label] * mos_in_stratum / np.sum(mos_in_stratum)
        )

    return incl_probs
def select(
    self,
    samp_unit: Array,
    samp_size: Union[Dict[Any, int], int, None] = None,
    stratum: Optional[Array] = None,
    mos: Optional[Array] = None,
    samp_rate: Union[Dict[Any, float], float, None] = None,
    probs: Optional[Array] = None,
    shuffle: bool = False,
    to_dataframe: bool = False,
    sample_only: bool = False,
) -> Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Selects the random sample.

    Args:
        samp_unit (Array): an array of all the observations in the target
            population.
        samp_size (Union[Dict[Any, int], int, None], optional): the sample
            size, or the dictionary of sample sizes by stratum, if
            applicable. Defaults to None.
        stratum (Optional[Array], optional): array of the strata associated
            to the population units. Defaults to None.
        mos (Optional[Array], optional): array of the measure of size
            associated to the population units. Defaults to None.
        samp_rate (Union[Dict[Any, float], float, None], optional): sampling
            rate, provided by stratum if applicable. Defaults to None.
        probs (Optional[Array], optional): array of the probability of
            selection associated to the population units. Defaults to None.
        shuffle (bool, optional): indicates whether to shuffle the data
            prior to running the selection algorithm. Only the "sys" and
            "pps-sys" methods are affected; the results are always returned
            in the order of ``samp_unit``. Defaults to False.
        to_dataframe (bool, optional): indicates whether to convert the
            output to a pandas dataframe. Defaults to False.
        sample_only (bool, optional): indicates whether to return only the
            sample without the out of sample units. Defaults to False.

    Raises:
        AssertionError: if both samp_size and samp_rate are provided.
        AssertionError: if some of the clusters are certainties.
        AssertionError: if ``self.method`` is not a method handled here.

    Returns:
        Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]: when
        ``sample_only`` is set, the dataframe rows whose ``_sample`` column
        equals 1; otherwise, when ``to_dataframe`` is set, the full
        dataframe; otherwise the tuple ``(sample, hits, probs)``.
    """
    samp_unit = formats.sample_units(samp_unit, unique=True)
    if stratum is not None:
        stratum = formats.numpy_array(stratum)
    if mos is not None:
        mos = formats.numpy_array(mos)
    if probs is not None:
        probs = formats.numpy_array(probs)

    if samp_size is not None and samp_rate is not None:
        raise AssertionError(
            "Both samp_size and samp_rate are provided. Only one of the two parameters should be specified."
        )
    if samp_size is not None:
        samp_size = formats.sample_size_dict(samp_size, self.stratification, stratum)
        samp_size = self._convert_to_dict(samp_size, int)
    if samp_rate is not None:
        samp_rate = self._convert_to_dict(samp_rate, float)

    # The selection routines may run on a shuffled view of the frame, while
    # the caller-ordered arrays are kept for everything that is returned.
    order = None
    sel_unit, sel_stratum, sel_mos = samp_unit, stratum, mos
    if shuffle and self.method in ("sys", "pps-sys"):
        # np.random.shuffle works in place and returns None, so a permutation
        # of the positions is drawn instead.
        order = np.random.permutation(samp_unit.size)
        sel_unit = samp_unit[order]
        if stratum is not None:
            sel_stratum = stratum[order]
        if self.method == "pps-sys" and mos is not None:
            sel_mos = mos[order]

    if self.method == "srs":
        probs = self._srs_inclusion_probs(samp_unit, samp_size, stratum=stratum)
        sample, hits = self._grs_select(probs, samp_unit, samp_size, stratum)
    elif self.method in (
        "pps-brewer",
        "pps-hv",
        "pps-murphy",
        "pps-rs",
        "pps-sys",
    ):
        if self._anycertainty(samp_size, stratum, mos):
            raise AssertionError("Some clusters are certainties.")
        # Computed on the caller-ordered arrays so that the returned
        # probabilities line up with samp_unit.
        probs = self.inclusion_probs(samp_unit, samp_size, stratum, mos)
        sample, hits = self._pps_select(sel_unit, samp_size, sel_stratum, sel_mos)
    elif self.method == "sys":
        # No probabilities are computed for this method; whatever the caller
        # passed in ``probs`` is returned unchanged.
        sample, hits = self._sys_select(sel_unit, samp_size, sel_stratum, samp_rate)
    elif self.method == "grs":
        sample, hits = self._grs_select(probs, samp_unit, samp_size, stratum)
    else:
        # AssertionError matches the other validation errors of this method.
        raise AssertionError(
            "Unsupported selection method: {!r}.".format(self.method)
        )

    if order is not None:
        # argsort of a permutation is its inverse: it maps the results
        # obtained on the shuffled frame back to the caller's order.
        inverse = np.argsort(order)
        sample = sample[inverse]
        hits = hits[inverse]

    if sample_only or to_dataframe:
        frame = self._to_dataframe(samp_unit, stratum, mos, sample, hits, probs)
        if sample_only:
            return frame.loc[frame["_sample"] == 1]
        return frame

    return sample, hits, probs
def inclusion_probs(
    self,
    samp_unit: Array,
    samp_size: Union[Dict[Any, int], int],
    stratum: Optional[Array] = None,
    mos: Optional[Array] = None,
    samp_rate: Union[Dict[Any, float], float, None] = None,
) -> np.ndarray:
    """Computes the inclusion probabilities according to the sampling scheme.

    Args:
        samp_unit (Array): an array of all the observations in the target
            population.
        samp_size (Union[Dict[Any, int], int]): the sample size, or the
            dictionary of sample sizes by stratum, if applicable.
        stratum (Optional[Array], optional): array of the strata associated
            to the population units. Defaults to None.
        mos (Optional[Array], optional): array of the measure of size
            associated to the population units. Defaults to None.
        samp_rate (Union[Dict[Any, float], float, None], optional): sampling
            rate, provided by stratum if applicable. Defaults to None.

    Raises:
        AssertionError: if some of the clusters are certainties.
        AssertionError: if ``self.method`` is not "srs", "sys" or one of the
            "pps-*" methods handled here.

    Returns:
        np.ndarray: an array of the probabilities of inclusion.
    """
    samp_unit = formats.sample_units(samp_unit, unique=True)
    if stratum is not None:
        stratum = formats.numpy_array(stratum)
    if mos is not None:
        mos = formats.numpy_array(mos)

    samp_size = formats.sample_size_dict(samp_size, self.stratification, stratum)
    if samp_size is not None:
        samp_size = self._convert_to_dict(samp_size, int)
    if samp_rate is not None:
        samp_rate = self._convert_to_dict(samp_rate, float)

    if self.method == "srs":
        return self._srs_inclusion_probs(samp_unit, samp_size, stratum)

    if self.method in (
        "pps-brewer",
        "pps-hv",
        "pps-murphy",
        "pps-rs",
        "pps-sys",
    ):
        if self._anycertainty(samp_size, stratum, mos):
            raise AssertionError("Some clusters are certainties.")
        return self._pps_inclusion_probs(samp_unit, samp_size, mos, stratum)

    if self.method == "sys":
        return self._sys_inclusion_probs(samp_unit, samp_size, stratum, samp_rate)

    # Any other method used to fall through to an unbound local variable;
    # fail with a message that names the method instead.
    raise AssertionError(
        "Inclusion probabilities are not available for method {!r}.".format(
            self.method
        )
    )
def _pps_select(
    self,
    samp_unit: np.ndarray,
    samp_size: Dict[Any, int],
    stratum: np.ndarray,
    mos: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """Dispatches the selection to the routine matching the PPS method.

    With stratification the routine is run once per stratum and its results
    are written into the positions of that stratum; otherwise it is run once
    on all the units with the ``"__none__"`` sample size.

    Args:
        samp_unit (np.ndarray): the sampling units of the population.
        samp_size (Dict[Any, int]): the sample size, overall or by stratum.
        stratum (np.ndarray): the stratum label of each sampling unit; only
            read when ``self.stratification`` is truthy.
        mos (np.ndarray): the measure of size of each sampling unit.

    Returns:
        Tuple[np.ndarray, np.ndarray]: the ``sample`` and ``hits`` arrays.
        Both are all zeros when ``self.method`` is not one of the PPS
        methods listed below.
    """
    samp_unit = formats.sample_units(samp_unit, unique=True)
    samp_size = formats.sample_size_dict(samp_size, self.stratification, stratum)

    # Two distinct buffers: a chained assignment would make both names point
    # at the same array, and writing the hits would overwrite the sample.
    sample = np.zeros(samp_unit.size, dtype=int)
    hits = np.zeros(samp_unit.size, dtype=int)

    # Exact lookup on the method name (a substring test would also accept
    # names such as "sys").
    selectors = {
        "pps-sys": self._pps_sys_select,  # systematic
        "pps-hv": self._pps_hv_select,  # hanurav-vijayan
        "pps-brewer": self._pps_brewer_select,
        "pps-murphy": self._pps_murphy_select,
        "pps-rs": self._pps_rs_select,
    }
    selector = selectors.get(self.method)
    if selector is None:
        # An unrecognised method selects nothing, as before.
        return sample, hits

    if not self.stratification:
        sample, hits = selector(samp_unit, samp_size["__none__"], mos)
        return sample, hits

    for s in np.unique(stratum):
        stratum_units = stratum == s
        sample[stratum_units], hits[stratum_units] = selector(
            samp_unit[stratum_units],
            samp_size[s],
            mos[stratum_units],
        )

    return sample, hits