Пример #1
0
    def normalize(
        self,
        samp_weight: Array,
        control: Union[Dict[StringNumber, Number], Number, None] = None,
        domain: Optional[Array] = None,
    ) -> np.ndarray:
        """Normalizes the sample weights so that they sum to known control levels.

        Args:
            samp_weight (Array): array of the pre-adjustment sample weights. This vector
                should contain numeric values.
            control (Union[Dict[StringNumber, Number], Number, None], optional): level(s) the
                normalized weights are calibrated to. A dictionary is looked up by domain key
                (by "__none__" when domain is None); a single number is used for every domain.
                When None, the number of sample units (by domain, or overall when domain is
                None) is used. Defaults to None.
            domain (Optional[Array], optional): array indicating the normalization class of
                each sample unit. Defaults to None.

        Raises:
            TypeError: if domain is provided and control is neither None, a dictionary nor
                a number.

        Returns:
            np.ndarray: the normalized sample weights.
        """

        # numpy integer scalars are not instances of int, so they are listed explicitly;
        # otherwise e.g. np.int64 controls would silently be ignored.
        number_types = (int, float, np.integer, np.floating)

        samp_weight = formats.numpy_array(samp_weight)

        if domain is not None:
            norm_weight = samp_weight.copy()
            domain = formats.numpy_array(domain)
            keys = np.unique(domain)
            levels: np.ndarray = np.full(keys.size, np.nan)
            for k, key in enumerate(keys):
                in_domain = domain == key  # computed once per domain
                if control is None:
                    levels[k] = np.sum(in_domain)
                elif isinstance(control, dict):
                    levels[k] = control[key]
                elif isinstance(control, number_types):
                    levels[k] = control
                else:
                    # Previously the level stayed NaN and produced NaN weights silently.
                    raise TypeError(
                        "control must be None, a number or a dictionary keyed by domain."
                    )

                (
                    norm_weight[in_domain],
                    self.adjust_factor[key],
                ) = self._norm_adjustment(samp_weight[in_domain], levels[k])
                self.control[key] = levels[k]
        else:
            if control is None:
                level = samp_weight.size
            elif isinstance(control, number_types):
                level = control
            else:
                # Anything else is expected to be a mapping holding the "__none__" key.
                level = control["__none__"]

            norm_weight, self.adjust_factor["__none__"] = self._norm_adjustment(
                samp_weight, level)
            self.control["__none__"] = level

        self.adjust_method = "normalization"

        return norm_weight
Пример #2
0
    def _degree_of_freedom(
        self, weight: np.ndarray, stratum: np.ndarray = None, psu: np.ndarray = None,
    ) -> None:
        """Stores the degrees of freedom of the design in ``self.degree_of_freedom``.

        * At most one stratum value: number of distinct PSUs minus one.
        * Several stratum and PSU values: number of distinct PSUs minus number of
          distinct strata.
        * Several stratum values but at most one PSU value: number of weights.
        """

        strata_arr = formats.numpy_array(stratum)
        psu_arr = formats.numpy_array(psu)

        has_strata = strata_arr.size > 1
        if has_strata and psu_arr.size <= 1:
            # No usable PSU information: fall back on the number of sample units.
            dof = formats.numpy_array(weight).size
        else:
            nb_strata = np.unique(strata_arr).size if has_strata else 1
            dof = np.unique(psu_arr).size - nb_strata

        self.degree_of_freedom = dof
Пример #3
0
    def deff_weight(
            self,
            samp_weight: Array,
            domain: Optional[np.ndarray] = None) -> Dict[StringNumber, Number]:
        """Computes the design effect due to unequal weights.

        The result is also stored in ``self.deff_wgt``.

        Args:
            samp_weight (Array): array of the sample weights. This vector should contain
                numeric values.
            domain (Optional[np.ndarray], optional): array indicating the class of each
                sample unit; one design effect is computed per distinct value.
                Defaults to None.

        Returns:
            Dict[StringNumber, Number]: dictionary pairing each domain (or "__none__" when
                domain is None) with the design effect due to unequal weights.
        """

        samp_weight = formats.numpy_array(samp_weight)

        deff_w: Dict[StringNumber, Number] = {}
        if domain is None:
            deff_w["__none__"] = self._deff_wgt(samp_weight)
        else:
            # Convert first so that `domain == d` is an element-wise mask even when the
            # caller passes a list (a plain list would compare to a single False).
            domain = formats.numpy_array(domain)
            for d in np.unique(domain):
                deff_w[d] = self._deff_wgt(samp_weight[domain == d])
        self.deff_wgt = deff_w

        return deff_w
Пример #4
0
    def predict(self,
                X: Array,
                area: Array,
                b_const: Union[np.array, Number] = 1.0,
                intercept: bool = True) -> None:
        """Provides the modelled area level estimates and their MSE estimates.

        The results are stored in ``self.area_est`` and ``self.area_mse`` as dictionaries
        keyed by area.

        Args:
            X (Array): a multi-dimensional array of the auxiliary variables associated to
                the areas to predict.
            area (Array): provides the areas for the prediction.
            b_const (Union[np.array, Number], optional): a single number is expanded to one
                value per area; otherwise converted to an array. Passed to the estimation
                routine as ``b_const``. Defaults to 1.0.
            intercept (bool, optional): when True, a column of ones is inserted as the first
                column of X. Defaults to True.

        Raises:
            Exception: if the model has not been fitted yet.
        """

        if not self.fitted:
            raise Exception(
                "The model must be fitted first with .fit() before running the prediction."
            )

        # Convert before reading `.size` so that list-like areas are accepted too.
        area = formats.numpy_array(area)

        if isinstance(b_const, (int, float)):
            b_const = np.ones(area.size) * b_const
        else:
            b_const = formats.numpy_array(b_const)

        X = formats.numpy_array(X)
        if intercept and isinstance(X, np.ndarray):
            X = np.insert(X, 0, 1, axis=1)

        # Only the point estimates and the MSE are kept; the remaining outputs are ignored.
        point_est, mse, *_ = self._eb_estimates(
            X=X,
            area=area,
            beta=self.fixed_effects,
            sigma2_e=self.error_std**2,
            sigma2_v=self.re_std**2,
            sigma2_v_cov=self.re_std_cov,
            b_const=b_const,
        )

        self.area_est = dict(zip(area, point_est))
        self.area_mse = dict(zip(area, mse))
Пример #5
0
    def _degree_of_freedom(
        self,
        samp_weight: np.ndarray,
        stratum: np.ndarray = None,
        psu: np.ndarray = None,
    ) -> None:
        """Stores the degrees of freedom of the design in ``self.degree_of_freedom``.

        When PSU information is usable, ``self.number_psus`` and ``self.number_strata`` are
        set as well and the degrees of freedom are their difference. With several stratum
        values but at most one PSU value, the number of sample weights is used instead and
        the PSU/strata counts are left untouched.
        """

        stratum = formats.numpy_array(stratum)
        psu = formats.numpy_array(psu)

        if stratum.size > 1 and psu.size <= 1:
            # Return early: this fallback used to be overwritten by the PSU/strata
            # difference below, which relied on attributes never set in this branch.
            samp_weight = formats.numpy_array(samp_weight)
            self.degree_of_freedom = samp_weight.size
            return

        if stratum.size <= 1:
            self.number_psus = np.unique(psu).size
            self.number_strata = 1
        else:
            # PSUs are counted as distinct (stratum, psu) pairs.
            self.number_psus = np.unique([stratum, psu], axis=1).shape[1]
            self.number_strata = np.unique(stratum).size

        self.degree_of_freedom = self.number_psus - self.number_strata
Пример #6
0
    def _response(resp_status: np.ndarray,
                  resp_dict: np.ndarray) -> np.ndarray:
        """Maps response statuses to the standard codes "in", "rr", "nr" and "uk".

        If at least one value of resp_status is already a standard code, the converted
        array is returned unchanged. Otherwise every value equal to ``resp_dict[code]`` is
        replaced by ``code``; values without a match are left as a two-space string.
        resp_dict is indexed by the standard codes.
        """

        standard_codes = ("in", "rr", "nr", "uk")

        resp_status = formats.numpy_array(resp_status)
        checks.assert_response_status(resp_status, resp_dict)

        if np.isin(resp_status, standard_codes).any():
            return resp_status

        recoded = np.repeat("  ", resp_status.size).astype(str)
        for code in standard_codes:
            recoded[resp_status == resp_dict[code]] = code

        return recoded
Пример #7
0
def _plot_measure(
    y: np.ndarray,
    coef_min: Number = -5,
    coef_max: Number = 5,
    nb_points: int = 100,
    measure: str = "skewness",
) -> None:
    """Plots a normality measure of the transformed data over a range of lambdas.

    Args:
        y (np.ndarray): data to transform.
        coef_min (Number, optional): smallest lambda of the grid. Defaults to -5.
        coef_max (Number, optional): largest lambda of the grid. Defaults to 5.
        nb_points (int, optional): number of evenly spaced lambdas. Defaults to 100.
        measure (str, optional): "skewness" or "kurtosis" (case insensitive).
            Defaults to "skewness".

    Raises:
        ValueError: if measure is not one of the supported measures.
    """
    # Resolve the measure once, up front, so that an unsupported name fails with a clear
    # message instead of an unbound-variable error when the legend is drawn.
    measure_key = measure.lower()
    if measure_key == "skewness":
        statistic, legend_loc = skewness, "lower right"
    elif measure_key == "kurtosis":
        statistic, legend_loc = kurtosis, "upper right"
    else:
        raise ValueError(
            f"measure must be 'skewness' or 'kurtosis', got {measure!r}.")

    y = formats.numpy_array(y)
    lambda_range = np.linspace(coef_min, coef_max, num=nb_points)
    coefs = np.zeros(lambda_range.size)
    for k, ll in enumerate(lambda_range):
        coefs[k] = statistic(transform(y, ll))

    # Points whose absolute measure is below 2 are drawn as the "normality zone".
    normality = np.abs(coefs) < 2.0

    p1 = plt.scatter(
        lambda_range[normality],
        coefs[normality],
        marker="D",
        c="green",
        s=25,
        alpha=0.3,
    )
    p2 = plt.scatter(
        lambda_range[~normality],
        coefs[~normality],
        c="red",
        s=10,
        alpha=0.6,
        edgecolors="none",
    )
    plt.axhline(0, color="blue", linestyle="--")
    plt.title(f"{measure.title()} by BoxCox lambda")
    plt.ylabel(f"{measure.title()}")
    plt.xlabel("Lambda (coefs)")
    plt.legend(
        (p1, p2),
        ("Normality zone", "Non-normality zone"),
        loc=legend_loc,
    )
    plt.show()
Пример #8
0
    def _remove_nans(
        self,
        excluded_units: Array,
        y: Array,
        samp_weight: Array,
        x: Array = None,
        stratum: Array = None,
        domain: Array = None,
        psu: Array = None,
        ssu: Array = None,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray],
               Optional[np.ndarray], Optional[np.ndarray],
               Optional[np.ndarray], ]:
        """Drops the excluded units from every provided array.

        excluded_units is used as a mask: the units where it is True are removed. Optional
        arrays that are None are returned as None. The result is the tuple
        (y, samp_weight, x, stratum, domain, psu, ssu).
        """
        y_arr = formats.numpy_array(y)
        weight_arr = formats.numpy_array(samp_weight)
        kept = ~excluded_units

        def _keep(values):
            # None stands for "not provided" and is passed through untouched.
            if values is None:
                return None
            return formats.numpy_array(values)[kept]

        x, stratum, domain, psu, ssu = (
            _keep(values) for values in (x, stratum, domain, psu, ssu))

        return (y_arr[kept], weight_arr[kept], x, stratum, domain, psu, ssu)
Пример #9
0
    def select(
        self,
        samp_unit: Array,
        samp_size: Union[Dict[Any, int], int, None] = None,
        stratum: Optional[Array] = None,
        mos: Optional[Array] = None,
        samp_rate: Union[Dict[Any, float], float, None] = None,
        probs: Optional[Array] = None,
        shuffle: bool = False,
        to_dataframe: bool = False,
        sample_only: bool = False,
    ) -> Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Selects the random sample.

        Args:
            samp_unit (Array): an array of all the observations in the target population.
            samp_size (Union[Dict[Any, int], int, None], optional): the sample size, or the
                dictionary of sample sizes by stratum, if applicable. Defaults to None.
            stratum (Optional[Array], optional): array of the strata associated to the
                population units. Defaults to None.
            mos (Optional[Array], optional): array of the measure of size associated to the
                population units. Defaults to None.
            samp_rate (Union[Dict[Any, float], float, None], optional): sampling rate provided
                by stratum if applicable. Defaults to None.
            probs (Optional[Array], optional): array of the probability of selection
                associated to the population units. Defaults to None.
            shuffle (bool, optional): indicates whether to shuffle the data prior to running
                the selection algorithm. Only applied to the "sys" and "pps-sys" methods; the
                results are returned in the order of the input units. Defaults to False.
            to_dataframe (bool, optional): indicates whether to convert the output to a pandas
                dataframe. Defaults to False.
            sample_only (bool, optional): indicates whether to return only the sample without
                the out of sample units (as a dataframe). Defaults to False.

        Raises:
            AssertionError: raises an assertion error if both samp_size and samp_rate is
                provided as input.
            AssertionError: raises an assertion error if some of the clusters are certainties.
            AssertionError: raises an assertion error if the sampling method is not recognized.

        Returns:
            Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]: a dataframe when
                sample_only or to_dataframe is True, otherwise the tuple
                (sample, hits, probs).
        """

        samp_unit = formats.sample_units(samp_unit, unique=True)

        if stratum is not None:
            stratum = formats.numpy_array(stratum)
        if mos is not None:
            mos = formats.numpy_array(mos)
        if probs is not None:
            probs = formats.numpy_array(probs)

        if samp_size is not None and samp_rate is not None:
            raise AssertionError(
                "Both samp_size and samp_rate are provided. Only one of the two parameters should be specified."
            )

        if samp_size is not None:
            samp_size = formats.sample_size_dict(samp_size,
                                                 self.stratification, stratum)
            samp_size = self._convert_to_dict(samp_size, int)
        if samp_rate is not None:
            samp_rate = self._convert_to_dict(samp_rate, float)

        # The selection runs on (possibly shuffled) copies; the caller's arrays keep their
        # order so that the outputs can be reported against them.
        sel_unit, sel_stratum, sel_mos = samp_unit, stratum, mos
        restore_order = None
        if shuffle and self.method in ("sys", "pps-sys"):
            # np.random.shuffle works in place and returns None, so a permutation of the
            # positions is drawn instead.
            shuffled_order = np.random.permutation(samp_unit.size)
            # argsort of a permutation is its inverse: it maps shuffled positions back.
            restore_order = np.argsort(shuffled_order)
            sel_unit = samp_unit[shuffled_order]
            if stratum is not None:
                sel_stratum = stratum[shuffled_order]
            if self.method == "pps-sys" and mos is not None:
                sel_mos = mos[shuffled_order]

        if self.method == "srs":
            probs = self._srs_inclusion_probs(sel_unit,
                                              samp_size,
                                              stratum=sel_stratum)
            sample, hits = self._grs_select(probs, sel_unit, samp_size,
                                            sel_stratum)
        elif self.method in (
                "pps-brewer",
                "pps-hv",
                "pps-murphy",
                "pps-rs",
                "pps-sys",
        ):
            if self._anycertainty(samp_size, sel_stratum, sel_mos):
                raise AssertionError("Some clusters are certainties.")
            probs = self.inclusion_probs(sel_unit, samp_size, sel_stratum,
                                         sel_mos)
            sample, hits = self._pps_select(sel_unit, samp_size, sel_stratum,
                                            sel_mos)
        elif self.method == "sys":
            # NOTE: probs is returned as supplied by the caller for this method.
            sample, hits = self._sys_select(sel_unit, samp_size, sel_stratum,
                                            samp_rate)
        elif self.method == "grs":
            sample, hits = self._grs_select(probs, sel_unit, samp_size,
                                            sel_stratum)
        else:
            raise AssertionError("Sampling method not recognized.")

        if restore_order is not None:
            # Put the per-unit results back in the order of the input units.
            # NOTE(review): assumes sample and hits hold one entry per unit — confirm.
            sample = sample[restore_order]
            hits = hits[restore_order]
            if self.method == "pps-sys":
                # These probabilities were computed on the shuffled units.
                probs = probs[restore_order]

        if sample_only or to_dataframe:
            frame = self._to_dataframe(samp_unit, stratum, mos, sample, hits,
                                       probs)
            return frame.loc[frame["_sample"] == 1] if sample_only else frame

        return sample, hits, probs
Пример #10
0
    def inclusion_probs(
        self,
        samp_unit: Array,
        samp_size: Union[Dict[Any, int], int],
        stratum: Optional[Array] = None,
        mos: Optional[Array] = None,
        samp_rate: Union[Dict[Any, float], float, None] = None,
    ) -> np.ndarray:
        """Computes the inclusion probabilities according to the sampling scheme.

        Args:
            samp_unit (Array): an array of all the observations in the target population.
            samp_size (Union[Dict[Any, int], int]): the sample size, or the dictionary of
                sample sizes by stratum, if applicable.
            stratum (Optional[Array], optional): array of the strata associated to the
                population units. Defaults to None.
            mos (Optional[Array], optional): array of the measure of size associated to the
                population units. Defaults to None.
            samp_rate (Union[Dict[Any, float], float, None], optional): sampling rate provided
                by stratum if applicable. Defaults to None.

        Raises:
            AssertionError: raises an assertion error if some of the clusters are certainties.
            AssertionError: raises an assertion error if the inclusion probabilities cannot
                be computed for the sampling method.

        Returns:
            np.ndarray: an array of the probabilities of inclusion.
        """
        samp_unit = formats.sample_units(samp_unit, unique=True)

        if stratum is not None:
            stratum = formats.numpy_array(stratum)
        if mos is not None:
            mos = formats.numpy_array(mos)

        samp_size = formats.sample_size_dict(samp_size, self.stratification,
                                             stratum)

        if samp_size is not None:
            samp_size = self._convert_to_dict(samp_size, int)
        if samp_rate is not None:
            samp_rate = self._convert_to_dict(samp_rate, float)

        if self.method == "srs":
            incl_probs = self._srs_inclusion_probs(samp_unit, samp_size,
                                                   stratum)
        elif self.method in (
                "pps-brewer",
                "pps-hv",
                "pps-murphy",
                "pps-rs",
                "pps-sys",
        ):
            if self._anycertainty(samp_size, stratum, mos):
                raise AssertionError("Some clusters are certainties.")
            incl_probs = self._pps_inclusion_probs(samp_unit, samp_size, mos,
                                                   stratum)
        elif self.method == "sys":
            incl_probs = self._sys_inclusion_probs(samp_unit, samp_size,
                                                   stratum, samp_rate)
        else:
            # Fail explicitly instead of hitting an unbound local at the return below.
            raise AssertionError(
                f"Inclusion probabilities are not available for method '{self.method}'."
            )

        return incl_probs
Пример #11
0
def kurtosis(y: Array) -> float:
    """Returns the excess kurtosis of y.

    Computed as the mean fourth power of the deviations from the mean, divided by the
    fourth power of ``np.std(y)``, minus 3.
    """

    values = formats.numpy_array(y)
    deviations = values - np.mean(values)
    fourth_moment = np.mean(deviations**4)

    return float(fourth_moment / np.std(values)**4 - 3)
Пример #12
0
def skewness(y: Array) -> float:
    """Returns the skewness of y.

    Computed as the mean third power of the deviations from the mean, divided by the
    third power of ``np.std(y)``.
    """

    values = formats.numpy_array(y)
    deviations = values - np.mean(values)
    third_moment = np.mean(deviations**3)

    return float(third_moment / np.std(values)**3)
Пример #13
0
    def replicate(
        self,
        samp_weight: Array,
        psu: Array,
        stratum: Array = None,
        rep_coefs: Union[Array, Number] = False,
        rep_prefix: str = None,
        psu_varname: str = "_psu",
        str_varname: str = "_stratum",
    ) -> pd.DataFrame:
        """Computes replicate sample weights.

        Args:
            samp_weight (Array): array of sample weights. To incorporate the weights adjustment
                in the replicate weights, first replicate the design sample weights then apply
                the adjustments to the replicates.
            psu (Array): array of the primary sampling units.
            stratum (Array, optional): array of the strata. Ignored when the design is not
                stratified. Defaults to None.
            rep_coefs (Union[Array, Number], optional): when falsy, the replicate columns are
                multiplied by the sample weights; otherwise they are returned as computed.
                Defaults to False.
            rep_prefix (str, optional): prefix to apply to the replicate weights names.
                Defaults to None.
            psu_varname (str, optional): name of the psu variable in the output dataframe.
                Defaults to "_psu".
            str_varname (str, optional): name of the stratum variable in the output dataframe.
                Defaults to "_stratum".

        Raises:
            AssertionError: raises an assertion error when stratum is None for a stratified design.
            AssertionError: raises an assertion error when the replication method is not valid.

        Returns:
            pd.DataFrame: a dataframe of the replicates sample weights.
        """

        samp_weight = formats.numpy_array(samp_weight)

        # Validate the design before any state is updated.
        if not self.stratification:
            stratum = None
        elif stratum is None:
            raise AssertionError("For a stratified design, stratum must be specified.")

        self._degree_of_freedom(samp_weight, stratum, psu)

        if stratum is not None:
            stratum_psu = pd.DataFrame({str_varname: stratum, psu_varname: psu})
            stratum_psu.sort_values(by=str_varname, inplace=True)
            key = [str_varname, psu_varname]
        elif self.method == "brr":
            # Without strata, consecutive PSUs (in order of first appearance) are paired
            # into pseudo-strata.
            _, str_index = np.unique(psu, return_index=True)
            checks.assert_brr_number_psus(str_index)
            psus = psu[np.sort(str_index)]
            strata = np.repeat(range(1, psus.size // 2 + 1), 2)
            stratum_psu = pd.DataFrame({str_varname: strata, psu_varname: psus})
            psu_pd = pd.DataFrame({psu_varname: psu})
            stratum_psu = pd.merge(psu_pd, stratum_psu, on=psu_varname, how="left", sort=False)
            stratum_psu = stratum_psu[[str_varname, psu_varname]]
            key = [str_varname, psu_varname]
        else:
            stratum_psu = pd.DataFrame({psu_varname: psu})
            key = [psu_varname]

        psus_ids = stratum_psu.drop_duplicates()

        if self.method == "jackknife":
            self.number_reps = psus_ids.shape[0]
            _rep_data = self._jkn_replicates(psu, stratum)
        elif self.method == "bootstrap":
            _rep_data = self._boot_replicates(psu, stratum)
        elif self.method == "brr":
            _rep_data = self._brr_replicates(psu, stratum)
            # BRR with Fay's adjustment: each replicate gets 1 / (R * (1 - fay)^2).
            # The parentheses matter: without them (1 - fay)^2 ends up in the numerator.
            self.rep_coefs = list(
                (1 / (self.number_reps * pow(1 - self.fay_coef, 2))) * np.ones(self.number_reps)
            )
        else:
            raise AssertionError(
                "Replication method not recognized. Possible options are: 'bootstrap', 'brr', and 'jackknife'"
            )

        rep_prefix = self._rep_prefix(rep_prefix)
        _rep_data = self._reps_to_dataframe(psus_ids, _rep_data, rep_prefix)

        samp_weight = pd.DataFrame({"_samp_weight": samp_weight})
        samp_weight.reset_index(drop=True, inplace=True)
        full_sample = pd.concat([stratum_psu, samp_weight], axis=1)
        full_sample = pd.merge(full_sample, _rep_data, on=key, how="left", sort=False)

        if not rep_coefs:
            rep_cols = [col for col in full_sample if col.startswith(rep_prefix)]
            # Scale by the weight column of the same frame so that every row is multiplied
            # by its own weight, whatever row order the sort/concat above produced.
            full_sample[rep_cols] = full_sample[rep_cols].mul(
                full_sample["_samp_weight"], axis=0
            )

        return full_sample
Пример #14
0
    def estimate(
        self: TypeRepEst,
        y: Array,
        samp_weight: Array,
        rep_weights: Union[np.ndarray, pd.DataFrame],
        x: Union[np.ndarray, pd.DataFrame, None] = None,
        rep_coefs: Union[float, np.ndarray, None] = None,
        domain: Optional[np.ndarray] = None,
        conservative: bool = False,
        deff: bool = False,  # Todo
        remove_nan: bool = False,
    ) -> TypeRepEst:
        """Computes the point estimate and its replication-based precision measures.

        The results are stored on the estimator: point_est, variance, stderror, coef_var,
        lower_ci, upper_ci and degree_of_freedom.

        Args:
            self (TypeRepEst): the estimator.
            y (Array): variable of interest.
            samp_weight (Array): full sample weights.
            rep_weights (Union[np.ndarray, pd.DataFrame]): replicate weights, one column per
                replicate.
            x (Union[np.ndarray, pd.DataFrame, None], optional): auxiliary variable, required
                when the parameter is "ratio". Defaults to None.
            rep_coefs (Union[float, np.ndarray, None], optional): replicate coefficients,
                forwarded to ``self._rep_coefs``. Defaults to None.
            domain (Optional[np.ndarray], optional): domain of each unit; its distinct values
                are stored in ``self.domains``. Defaults to None.
            conservative (bool, optional): stored on the estimator and forwarded to the
                variance computation. Defaults to False.
            deff (bool, optional): currently not used. Defaults to False.
            remove_nan (bool, optional): when True, units with a missing y (or a missing x
                for a ratio) are dropped before estimating. Defaults to False.

        Raises:
            AssertionError: if the parameter is "ratio" and x is None.

        Returns:
            TypeRepEst: the estimator itself.
        """

        if x is None and self.parameter == "ratio":
            raise AssertionError("x must be provided for ratio estimation.")

        if not isinstance(rep_weights, np.ndarray):
            rep_weights = formats.numpy_array(rep_weights)

        if remove_nan:
            to_drop = np.isnan(y)
            if self.parameter == "ratio":
                to_drop = to_drop | np.isnan(x)
            y, samp_weight, x, _, domain, _, _ = self._remove_nans(
                to_drop, y, samp_weight, x, None, domain, None, None)
            rep_weights = rep_weights[~to_drop, :]

        self.conservative = conservative

        # The number of replicates defaults to the number of replicate-weight columns.
        if self.number_reps is None:
            self.number_reps = rep_weights.shape[1]

        self._rep_coefs(rep_coefs)

        if domain is not None:
            self.domains = np.unique(domain)

        self.point_est = self._get_point(y, samp_weight, x, domain)
        self.variance = self._get_variance(
            y,
            samp_weight,
            rep_weights,
            np.array(self.rep_coefs),
            x,
            domain,
            conservative,
            remove_nan,
        )

        # Degrees of freedom are only derived here when not already set.
        if self.degree_of_freedom is None:
            if self.method == "brr":
                self.degree_of_freedom = int(self.number_reps / 2)
            else:
                self.degree_of_freedom = int(self.number_reps) - 1

        t_quantile = student.ppf(1 - self.alpha / 2, df=self.degree_of_freedom)

        self.lower_ci, self.upper_ci = self._get_confint(
            self.parameter, self.point_est, self.variance, t_quantile)
        self.coef_var = self._get_coefvar(self.parameter, self.point_est,
                                          self.variance)

        # Standard errors are the square roots of the variances; for proportions the
        # variance of each key is itself indexed by level.
        for key in self.variance:
            variance_k = self.variance[key]
            if self.parameter == "proportion":
                self.stderror[key] = {
                    level: pow(variance_k[level], 0.5) for level in variance_k
                }
            else:
                self.stderror[key] = pow(variance_k, 0.5)

        return self
Пример #15
0
    def fit(
        self,
        yhat: Array,
        X: Array,
        area: Array,
        error_std: Array,
        re_std_start: float = 0.001,
        b_const: Union[np.array, Number] = 1.0,
        intercept: bool = True,
        tol: float = 1e-8,
        maxiter: int = 100,
    ) -> None:
        """Fits the linear mixed models to estimate the fixed effects and the standard error of
        the random effects. In addition, the method provides statistics related to the model
        fitting e.g. convergence status, log-likelihood, and more.

        Args:
            yhat (Array): an array of the estimated area level survey estimates also called
                the direct estimates.
            X (Array): a multi-dimensional array of the auxiliary information associated to
                the sampled areas.
            area (Array): provides the areas associated to the direct estimates.
            error_std (Array): standard errors of the direct estimates; their squares are used
                as the sampling error variances.
            re_std_start (float, optional): starting value for the standard error of the
                random effects. Defaults to 0.001.
            b_const (Union[np.array, Number], optional): a single number is expanded to one
                value per area; otherwise converted to an array. Defaults to 1.0.
            intercept (bool, optional): when True, a column of ones is inserted as the first
                column of X. Defaults to True.
            tol (float, optional): tolerance used for convergence criteria. Defaults to 1e-8.
            maxiter (int, optional): maximum number of iterations for the fitting algorithm.
                Defaults to 100.
        """

        # Convert before reading `.size` so that list-like areas are accepted too.
        area = formats.numpy_array(area)

        if isinstance(b_const, (int, float)):
            b_const = np.ones(area.size) * b_const
        else:
            b_const = formats.numpy_array(b_const)

        yhat = formats.numpy_array(yhat)
        # error_std is squared and passed to np.diag below, so it must be an array too.
        error_std = formats.numpy_array(error_std)
        X = formats.numpy_array(X)
        if intercept and isinstance(X, np.ndarray):
            X = np.insert(X, 0, 1, axis=1)

        sigma2_e = error_std**2

        (
            sigma2_v,
            sigma2_v_cov,
            iterations,
            tolerance,
            convergence,
        ) = self._iterative_fisher_scoring(
            area=area,
            yhat=yhat,
            X=X,
            sigma2_e=sigma2_e,
            b_const=b_const,
            sigma2_v_start=re_std_start**2,
            tol=tol,
            maxiter=maxiter,
        )

        beta, beta_cov = self._fixed_coefficients(
            area=area,
            yhat=yhat,
            X=X,
            sigma2_e=sigma2_e,
            sigma2_v=sigma2_v,
            b_const=b_const,
        )

        self.yhat = yhat
        self.error_std = error_std
        self.X = X
        self.area = area
        self.fixed_effects = beta
        self.fe_std = np.diag(beta_cov)**(1 / 2)
        self.re_std = sigma2_v**(1 / 2)
        self.re_std_cov = sigma2_v_cov

        self.convergence["achieved"] = convergence
        self.convergence["iterations"] = iterations
        self.convergence["precision"] = tolerance

        m = yhat.size
        p = X.shape[1] + 1
        # NOTE(review): V uses an all-ones matrix and does not involve b_const — confirm
        # this is the intended covariance structure.
        Z_b2_Z = np.ones(shape=(m, m))
        V = np.diag(sigma2_e) + sigma2_v * Z_b2_Z
        logllike = self._log_likelihood(yhat,
                                        X=X,
                                        beta=self.fixed_effects,
                                        V=V)
        self.goodness["loglike"] = logllike
        self.goodness["AIC"] = -2 * logllike + 2 * (p + 1)
        self.goodness["BIC"] = -2 * logllike + math.log(m) * (p + 1)

        self.fitted = True
Пример #16
0
    def adjust(
        self,
        samp_weight: np.ndarray,
        adjust_class: np.ndarray,
        resp_status: np.ndarray,
        resp_dict: Union[Dict[str, StringNumber], None] = None,
        unknown_to_inelig: bool = True,
    ) -> np.ndarray:
        """Adjusts the sample weights to account for non-response.

        The adjustment factors are stored in ``self.adjust_factor`` (by class, or under
        "__none__" when adjust_class is None) and the design effect due to unequal weights
        of the adjusted weights is stored in ``self.deff_wgt``.

        Args:
            samp_weight (np.ndarray): array of the pre-adjustment sample weight. This vector
                should contain numeric values.
            adjust_class (np.ndarray): array indicating the adjustment class for each sample
                unit. The sample weight adjustments are performed within the classes defined
                by this parameter. When None, a single adjustment is applied to all units.
            resp_status (np.ndarray): array indicating the eligibility and response status of
                the sample unit. Values should inform on ineligible (in), respondent (rr),
                nonrespondent (nr), not known / unknown (uk). If the values are not in
                ("in", "rr", "nr", "uk") then resp_dict is required.
            resp_dict (Union[Dict[str, StringNumber], None], optional): dictionary mapping
                "in", "rr", "nr" and "uk" to the values used in resp_status. For example,
                with 0 for ineligible, 1 for respondent, 2 for nonrespondent and 9 for
                unknown, the dictionary is {"in": 0, "rr": 1, "nr": 2, "uk": 9}.
                Defaults to None.
            unknown_to_inelig (bool, optional): forwarded to the adjustment factor
                computation. Defaults to True.

        Raises:
            AssertionError: raises an assertion error if adjust_class is not a list, numpy
                array, or pandas dataframe/series.

        Returns:
            np.ndarray: array of the adjusted sample weights.
        """

        resp_code = self._response(resp_status, resp_dict)
        samp_weight = formats.numpy_array(samp_weight)

        if adjust_class is None:
            factor, self.adjust_factor["__none__"] = self._adjust_factor(
                samp_weight, resp_code, unknown_to_inelig)
            adjusted_weight = factor * samp_weight
        else:
            if isinstance(adjust_class, list):
                adjust_class = pd.DataFrame(np.column_stack(adjust_class))
            elif isinstance(adjust_class, np.ndarray):
                adjust_class = pd.DataFrame(adjust_class)
            elif not isinstance(adjust_class, (pd.Series, pd.DataFrame)):
                raise AssertionError(
                    "adjust_class must be an numpy ndarray, a list of numpy ndarray or a pandas dataframe."
                )

            class_labels = formats.dataframe_to_array(adjust_class)

            # Units that fall in no class keep a missing adjusted weight.
            adjusted_weight = np.full(samp_weight.size, np.nan)
            for c in np.unique(class_labels):
                in_class = class_labels == c
                weight_c = samp_weight[in_class]
                factor_c, self.adjust_factor[c] = self._adjust_factor(
                    weight_c, resp_code[in_class], unknown_to_inelig)
                adjusted_weight[in_class] = factor_c * weight_c

        self.deff_wgt = self.deff_weight(adjusted_weight)
        self.adjust_method = "nonresponse"

        return adjusted_weight
Пример #17
0
    def calibrate(
        self,
        samp_weight: Array,
        aux_vars: Array,
        control: Union[Dict[StringNumber, Union[DictStrNum, Number]],
                       None] = None,
        domain: Optional[Array] = None,
        scale: Union[Array, Number] = 1,
        bounded: bool = False,
        additive: bool = False,
    ) -> np.ndarray:
        """Calibrates the sample weights.

        The adjustment factor is computed from ``self._core_matrix`` and
        ``self._calib_wgt`` and multiplied by the input weights. As a side
        effect, ``self.adjust_method`` is set to ``"calibration"``.

        Args:
            samp_weight (Array): array of sample weights.
            aux_vars (Array): array of auxiliary variables. An array of shape
                ``(samp_size,)`` is handled as a single auxiliary variable; any
                other shape is handled as several variables (presumably one row
                per sample unit -- confirm against ``formats.numpy_array``).
            control (Union[Dict[StringNumber, Union[DictStrNum, Number]], None], optional):
                the control totals. Without ``domain``, the dictionary values are used
                as the controls. With ``domain``, the dictionary is keyed by domain value
                and each entry is either a single number or a dictionary of controls.
                Although the default is None, a dictionary is required.
            domain (Optional[Array], optional): Array indicating the calibration class for
                each sample unit. Defaults to None.
            scale (Union[Array, Number], optional): divisor applied to the calibration
                term of the adjustment factor. A scalar is repeated for every sample
                unit. Defaults to 1.
            bounded (bool, optional): currently not used by this method. Defaults to False.
            additive (bool, optional): only affects the computation when ``domain`` is
                provided. If True, the core matrix of each domain is built from the whole
                sample and one column of weights is returned per domain. Defaults to False.

        Raises:
            AssertionError: if control is None, or if a domain has no numeric or
                dictionary entry in control.

        Returns:
            np.ndarray: an array of the calibrated sample weights. It has shape
            ``(samp_size, number of domains)`` when ``domain`` is provided and
            ``additive`` is True.
        """

        if control is None:
            raise AssertionError(
                "control must be a dictionary providing the control totals.")

        # Convert first so that lists and pandas objects expose ``.size``.
        samp_weight = formats.numpy_array(samp_weight)
        aux_vars = formats.numpy_array(aux_vars)
        samp_size = samp_weight.size

        if domain is not None:
            domain = formats.numpy_array(domain)

        # Any scalar (Python or numpy) is broadcast to one value per unit;
        # array-likes are converted so boolean-mask indexing works below.
        if np.ndim(scale) == 0:
            scale = np.repeat(scale, samp_size)
        else:
            scale = np.asarray(scale)

        if aux_vars.shape == (samp_size, ):
            x_w = aux_vars * samp_weight
            one_dimension = True
        else:
            x_w = np.transpose(aux_vars) * samp_weight
            one_dimension = False

        if domain is None:
            if one_dimension:
                x_w_total = np.sum(x_w)
            else:
                x_w_total = np.sum(x_w, axis=1)
            core_factor = self._core_matrix(
                samp_weight=samp_weight,
                x=aux_vars,
                x_weighted_total=x_w_total,
                x_control=np.array(list(control.values())),
                scale=scale,
            )
            adjust_factor = 1 + self._calib_wgt(aux_vars, core_factor) / scale
        else:
            domains = np.unique(domain)
            if additive:
                adjust_factor = np.ones((samp_size, domains.size)) * np.nan
            else:
                adjust_factor = np.ones(samp_size) * np.nan

            for k, d in enumerate(domains):
                in_domain = domain == d

                x_d = aux_vars[in_domain]
                samp_weight_d = samp_weight[in_domain]
                if one_dimension:
                    x_w_total_d = np.sum(x_w[in_domain])
                else:
                    x_w_total_d = np.sum(np.transpose(x_w)[in_domain],
                                         axis=0)

                control_d = control.get(d)
                if isinstance(control_d, (int, float, np.number)):
                    control_d_values = [control_d]
                elif isinstance(control_d, dict):
                    control_d_values = list(control_d.values())
                else:
                    # Fail loudly rather than reuse the previous domain's
                    # controls (or hit an unbound local on the first domain).
                    raise AssertionError(
                        f"control must provide a number or a dictionary for domain {d!r}."
                    )

                scale_d = scale[in_domain]
                if additive:
                    # Additive mode: the core matrix uses the full sample and
                    # the result fills the k-th column.
                    core_factor_d = self._core_matrix(
                        samp_weight=samp_weight,
                        x=aux_vars,
                        x_weighted_total=x_w_total_d,
                        x_control=np.array(control_d_values),
                        scale=scale,
                    )
                    adjust_factor[:, k] = in_domain + self._calib_wgt(
                        aux_vars, core_factor_d) / scale
                else:
                    core_factor_d = self._core_matrix(
                        samp_weight=samp_weight_d,
                        x=x_d,
                        x_weighted_total=x_w_total_d,
                        x_control=np.array(control_d_values),
                        scale=scale_d,
                    )
                    adjust_factor[in_domain] = 1 + self._calib_wgt(
                        x_d, core_factor_d) / scale_d

        if additive:
            # The double transpose broadcasts samp_weight over the rows of a
            # 2-D adjust_factor; it is a no-op for a 1-D one.
            calib_weight = np.transpose(
                np.transpose(adjust_factor) * samp_weight)
        else:
            calib_weight = samp_weight * adjust_factor

        self.adjust_method = "calibration"

        return calib_weight