def normalize(
    self,
    samp_weight: Array,
    control: Union[Dict[StringNumber, Number], Number, None] = None,
    domain: Optional[Array] = None,
) -> np.ndarray:
    """Normalizes the sample weights so that they sum to known constants or levels.

    Args:
        samp_weight (Array): array of the pre-adjustment sample weights. This vector
            should contain numeric values.
        control (Union[Dict[StringNumber, Number], Number, None], optional): level(s)
            the normalized weights are calibrated to. A dictionary maps each domain
            value to its level (with the key "__none__" when domain is None); a
            single number is used for every domain. When None, the number of sample
            units (by domain, or overall if domain is None) is used.
            Defaults to None.
        domain (Optional[Array], optional): array indicating the normalization class
            of each sample unit. Defaults to None.

    Returns:
        np.ndarray: the normalized sample weights.
    """
    samp_weight = formats.numpy_array(samp_weight)
    # The per-domain results are written back through a boolean mask. If the
    # destination kept an integer dtype, NumPy would silently truncate the
    # normalized (fractional) weights, so non-float inputs are copied as float.
    if np.issubdtype(samp_weight.dtype, np.floating):
        norm_weight = samp_weight.copy()
    else:
        norm_weight = samp_weight.astype(float)

    # NumPy scalars (e.g. np.int64) are not instances of int, so accept them
    # explicitly alongside the Python numeric types.
    scalar_types = (int, float, np.integer, np.floating)

    if domain is not None:
        domain = formats.numpy_array(domain)
        keys = np.unique(domain)
        # NaN marks a level that could not be derived from `control`.
        levels: np.ndarray = np.zeros(keys.size) * np.nan
        for k, key in enumerate(keys):
            in_domain = domain == key
            if control is None:
                levels[k] = np.sum(in_domain)
            elif isinstance(control, dict):
                levels[k] = control[key]
            elif isinstance(control, scalar_types):
                levels[k] = control
            (
                norm_weight[in_domain],
                self.adjust_factor[key],
            ) = self._norm_adjustment(samp_weight[in_domain], levels[k])
            self.control[key] = levels[k]
    else:
        if control is None:
            level = np.int_(samp_weight.size)
        elif isinstance(control, scalar_types):
            level = control
        else:
            level = control["__none__"]
        norm_weight, self.adjust_factor["__none__"] = self._norm_adjustment(
            samp_weight, level
        )
        self.control["__none__"] = level

    self.adjust_method = "normalization"

    return norm_weight
def _degree_of_freedom(
    self,
    weight: np.ndarray,
    stratum: np.ndarray = None,
    psu: np.ndarray = None,
) -> None:
    """Derives the degrees of freedom and stores them in self.degree_of_freedom.

    Three cases, decided from the sizes of the converted arrays:
        * at most one stratum value: number of unique PSUs minus one;
        * several stratum values and several PSU values: number of unique PSUs
          minus the number of unique strata;
        * otherwise: the number of elements in weight.

    Args:
        weight (np.ndarray): sample weights, only used in the last case.
        stratum (np.ndarray, optional): stratum of each unit. Defaults to None.
        psu (np.ndarray, optional): primary sampling unit of each unit.
            Defaults to None.
    """
    stratum = formats.numpy_array(stratum)
    psu = formats.numpy_array(psu)

    if stratum.size <= 1:
        dof = np.unique(psu).size - 1
    elif psu.size > 1:
        dof = np.unique(psu).size - np.unique(stratum).size
    else:
        dof = formats.numpy_array(weight).size

    self.degree_of_freedom = dof
def deff_weight(
    self, samp_weight: Array, domain: Optional[np.ndarray] = None
) -> Dict[StringNumber, Number]:
    """Computes the design effect due to unequal weights.

    The result is also stored in self.deff_wgt.

    Args:
        samp_weight (Array): array of the pre-adjustment sample weights. This vector
            should contain numeric values.
        domain (Optional[np.ndarray], optional): array indicating the domain of each
            sample unit; one design effect is computed per distinct value.
            Defaults to None.

    Returns:
        Dict[StringNumber, Number]: dictionary pairing the domains to the design
            effects due to unequal weights. The single key "__none__" is used when
            domain is None.
    """
    samp_weight = formats.numpy_array(samp_weight)

    deff_w: Dict[StringNumber, Number] = {}
    if domain is None:
        deff_w["__none__"] = self._deff_wgt(samp_weight)
    else:
        # Convert first: with a plain list, `domain == d` would be a single
        # boolean instead of an element-wise mask.
        domain = formats.numpy_array(domain)
        for d in np.unique(domain):
            deff_w[d] = self._deff_wgt(samp_weight[domain == d])
    self.deff_wgt = deff_w

    return deff_w
def predict(
    self,
    X: Array,
    area: Array,
    b_const: Union[np.array, Number] = 1.0,
    intercept: bool = True,
) -> None:
    """Provides the modelled area level estimates and their MSE estimates.

    The point estimates are stored in self.area_est and the MSE estimates in
    self.area_mse, both as dictionaries keyed by area.

    Args:
        X (Array): a multi-dimensional array of the auxiliary variables associated
            to the areas to predict.
        area (Array): provides the areas for the prediction.
        b_const (Union[np.array, Number], optional): constant(s) passed to the
            estimation routine; a single number is expanded to one value per area.
            Defaults to 1.0.
        intercept (bool, optional): whether to prepend a column of ones to X.
            Defaults to True.

    Raises:
        Exception: when the model has not been fitted yet.
    """
    if not self.fitted:
        raise Exception(
            "The model must be fitted first with .fit() before running the prediction."
        )

    # Convert `area` before reading its size so that lists are accepted too.
    area = formats.numpy_array(area)
    X = formats.numpy_array(X)

    if isinstance(b_const, (int, float)):
        b_const = np.ones(area.size) * b_const
    else:
        b_const = formats.numpy_array(b_const)

    if intercept and isinstance(X, np.ndarray):
        X = np.insert(X, 0, 1, axis=1)

    # NOTE(review): the sampling variances come from the fitted model
    # (self.error_std), not from an argument of this method - confirm that this
    # is intended when predicting for areas other than the fitted ones.
    # Only the point estimates and the MSE are kept; the remaining outputs of
    # _eb_estimates are not used here.
    point_est, mse, *_ = self._eb_estimates(
        X=X,
        area=area,
        beta=self.fixed_effects,
        sigma2_e=self.error_std**2,
        sigma2_v=self.re_std**2,
        sigma2_v_cov=self.re_std_cov,
        b_const=b_const,
    )

    self.area_est = dict(zip(area, point_est))
    self.area_mse = dict(zip(area, mse))
def _degree_of_freedom(
    self,
    samp_weight: np.ndarray,
    stratum: np.ndarray = None,
    psu: np.ndarray = None,
) -> None:
    """Derives the number of PSUs, the number of strata and the degrees of freedom.

    When PSU information is usable, self.number_psus and self.number_strata are
    set and self.degree_of_freedom is their difference. When there are several
    stratum values but at most one PSU value, self.degree_of_freedom is the number
    of sample weights and the two counters are left untouched.

    Args:
        samp_weight (np.ndarray): sample weights, only used in the fallback case.
        stratum (np.ndarray, optional): stratum of each unit. Defaults to None.
        psu (np.ndarray, optional): primary sampling unit of each unit.
            Defaults to None.
    """
    stratum = formats.numpy_array(stratum)
    psu = formats.numpy_array(psu)

    if stratum.size <= 1:
        self.number_psus = np.unique(psu).size
        self.number_strata = 1
    elif psu.size > 1:
        # PSUs are counted within strata: identical PSU labels appearing in
        # different strata are distinct (stratum, psu) pairs.
        self.number_psus = np.unique([stratum, psu], axis=1).shape[1]
        self.number_strata = np.unique(stratum).size
    else:
        # Return here so the fallback value is not overwritten by the
        # PSU/strata difference below, whose operands this branch never sets.
        self.degree_of_freedom = formats.numpy_array(samp_weight).size
        return

    self.degree_of_freedom = self.number_psus - self.number_strata
def _response(resp_status: np.ndarray, resp_dict: np.ndarray) -> np.ndarray:
    """Maps the response status values to the codes "in", "rr", "nr" and "uk".

    If any value of resp_status is already one of the four codes, the converted
    array is returned unchanged. Otherwise resp_dict supplies, for each code, the
    resp_status value that stands for it.

    Args:
        resp_status (np.ndarray): response status of each sample unit.
        resp_dict (np.ndarray): mapping indexed by "in", "rr", "nr" and "uk" giving
            the corresponding value used in resp_status.

    Returns:
        np.ndarray: array of response codes.
    """
    standard_codes = ("in", "rr", "nr", "uk")

    resp_status = formats.numpy_array(resp_status)
    checks.assert_response_status(resp_status, resp_dict)

    if np.isin(resp_status, standard_codes).any():
        return resp_status

    # Units whose status matches none of the mapped values keep the blank code.
    resp_code = np.repeat(" ", resp_status.size).astype(str)
    for code in standard_codes:
        resp_code[resp_status == resp_dict[code]] = code

    return resp_code
def _plot_measure(
    y: np.ndarray,
    coef_min: Number = -5,
    coef_max: Number = 5,
    nb_points: int = 100,
    measure: str = "skewness",
) -> None:
    """Plots the skewness or kurtosis of the transformed data over a lambda range.

    For each lambda in an evenly spaced grid, y is transformed with
    transform(y, lambda) and the chosen measure is computed. Points whose absolute
    measure is below 2.0 are drawn as the "Normality zone", the others as the
    "Non-normality zone".

    Args:
        y (np.ndarray): data to transform.
        coef_min (Number, optional): smallest lambda of the grid. Defaults to -5.
        coef_max (Number, optional): largest lambda of the grid. Defaults to 5.
        nb_points (int, optional): number of lambdas in the grid. Defaults to 100.
        measure (str, optional): "skewness" or "kurtosis" (case-insensitive).
            Defaults to "skewness".

    Raises:
        ValueError: when measure is neither "skewness" nor "kurtosis".
    """
    # Resolve the measure once, before any work is done: an unsupported value
    # would otherwise plot all-zero coefficients and fail on the legend location.
    measure_key = measure.lower()
    if measure_key == "skewness":
        measure_fn, measure_loc = skewness, "lower right"
    elif measure_key == "kurtosis":
        measure_fn, measure_loc = kurtosis, "upper right"
    else:
        raise ValueError(
            f"measure must be 'skewness' or 'kurtosis', got {measure!r}."
        )

    y = formats.numpy_array(y)
    lambda_range = np.linspace(coef_min, coef_max, num=nb_points)
    coefs = np.zeros(lambda_range.size)
    for k, ll in enumerate(lambda_range):
        coefs[k] = measure_fn(transform(y, ll))

    normality = np.abs(coefs) < 2.0

    p1 = plt.scatter(
        lambda_range[normality],
        coefs[normality],
        marker="D",
        c="green",
        s=25,
        alpha=0.3,
    )
    p2 = plt.scatter(
        lambda_range[~normality],
        coefs[~normality],
        c="red",
        s=10,
        alpha=0.6,
        edgecolors="none",
    )
    plt.axhline(0, color="blue", linestyle="--")
    plt.title(f"{measure.title()} by BoxCox lambda")
    plt.ylabel(f"{measure.title()}")
    plt.xlabel("Lambda (coefs)")
    plt.legend(
        (p1, p2),
        ("Normality zone", "Non-normality zone"),
        loc=measure_loc,
    )
    plt.show()
def _remove_nans(
    self,
    excluded_units: Array,
    y: Array,
    samp_weight: Array,
    x: Array = None,
    stratum: Array = None,
    domain: Array = None,
    psu: Array = None,
    ssu: Array = None,
) -> Tuple[
    np.ndarray,
    np.ndarray,
    np.ndarray,
    Optional[np.ndarray],
    Optional[np.ndarray],
    Optional[np.ndarray],
    Optional[np.ndarray],
]:
    """Drops the excluded units from every array that was provided.

    Args:
        excluded_units (Array): boolean mask, True for the units to drop.
        y (Array): variable of interest.
        samp_weight (Array): sample weights.
        x (Array, optional): auxiliary variable. Defaults to None.
        stratum (Array, optional): stratum of each unit. Defaults to None.
        domain (Array, optional): domain of each unit. Defaults to None.
        psu (Array, optional): primary sampling unit of each unit. Defaults to None.
        ssu (Array, optional): secondary sampling unit of each unit.
            Defaults to None.

    Returns:
        Tuple: y, samp_weight, x, stratum, domain, psu and ssu restricted to the
            kept units; optional inputs given as None are returned as None.
    """
    y = formats.numpy_array(y)
    samp_weight = formats.numpy_array(samp_weight)
    kept = ~excluded_units

    def _subset(values):
        # Optional arrays are passed through untouched when absent.
        if values is None:
            return None
        return formats.numpy_array(values)[kept]

    x = _subset(x)
    stratum = _subset(stratum)
    domain = _subset(domain)
    psu = _subset(psu)
    ssu = _subset(ssu)

    return (y[kept], samp_weight[kept], x, stratum, domain, psu, ssu)
def select(
    self,
    samp_unit: Array,
    samp_size: Union[Dict[Any, int], int, None] = None,
    stratum: Optional[Array] = None,
    mos: Optional[Array] = None,
    samp_rate: Union[Dict[Any, float], float, None] = None,
    probs: Optional[Array] = None,
    shuffle: bool = False,
    to_dataframe: bool = False,
    sample_only: bool = False,
) -> Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Selects the random sample.

    Args:
        samp_unit (Array): an array of all the observations in the target
            population.
        samp_size (Union[Dict[Any, int], int, None], optional): the dictionary of
            sample sizes by stratum, if applicable. Defaults to None.
        stratum (Optional[Array], optional): array of the strata associated to the
            population units. Defaults to None.
        mos (Optional[Array], optional): array of the measure of size associated to
            the population units. Defaults to None.
        samp_rate (Union[Dict[Any, float], float, None], optional): sampling rate
            provided by stratum if applicable. Defaults to None.
        probs (Optional[Array], optional): array of the probability of selection
            associated to the population units. Defaults to None.
        shuffle (bool, optional): indicates whether to shuffle the data prior to
            running the selection algorithm. Only applied to the "sys" and
            "pps-sys" methods; the outputs are put back in the input order.
            Defaults to False.
        to_dataframe (bool, optional): indicates whether to convert the output to a
            pandas dataframe. Defaults to False.
        sample_only (bool, optional): indicates whether to return only the sample
            without the out of sample units. Defaults to False.

    Raises:
        AssertionError: raises an assertion error if both samp_size and samp_rate
            are provided as input.
        AssertionError: raises an assertion error if some of the clusters are
            certainties.
        AssertionError: raises an assertion error if the sampling method is not
            supported.

    Returns:
        Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray]]: the tuple
            (sample, hits, probs) by default; a dataframe of all the units when
            to_dataframe is True; a dataframe restricted to the rows with
            "_sample" equal to 1 when sample_only is True.
    """
    samp_unit = formats.sample_units(samp_unit, unique=True)

    if stratum is not None:
        stratum = formats.numpy_array(stratum)
    if mos is not None:
        mos = formats.numpy_array(mos)
    if probs is not None:
        probs = formats.numpy_array(probs)

    if samp_size is not None and samp_rate is not None:
        raise AssertionError(
            "Both samp_size and samp_rate are provided. Only one of the two parameters should be specified."
        )

    if samp_size is not None:
        samp_size = formats.sample_size_dict(samp_size, self.stratification, stratum)
        samp_size = self._convert_to_dict(samp_size, int)
    if samp_rate is not None:
        samp_rate = self._convert_to_dict(samp_rate, float)

    # np.random.shuffle works in place and returns None, so an explicit
    # permutation is drawn instead. It stays None when no shuffling happens.
    shuffled_order = None
    mos_shuffled = False
    if shuffle and self.method in ("sys", "pps-sys"):
        shuffled_order = np.random.permutation(samp_unit.size)
        samp_unit = samp_unit[shuffled_order]
        if stratum is not None:
            stratum = stratum[shuffled_order]
        if self.method == "pps-sys" and mos is not None:
            mos = mos[shuffled_order]
            mos_shuffled = True

    if self.method == "srs":
        probs = self._srs_inclusion_probs(samp_unit, samp_size, stratum=stratum)
        sample, hits = self._grs_select(probs, samp_unit, samp_size, stratum)
    elif self.method in (
        "pps-brewer",
        "pps-hv",
        "pps-murphy",
        "pps-rs",
        "pps-sys",
    ):
        if self._anycertainty(samp_size, stratum, mos):
            raise AssertionError("Some clusters are certainties.")
        probs = self.inclusion_probs(samp_unit, samp_size, stratum, mos)
        sample, hits = self._pps_select(samp_unit, samp_size, stratum, mos)
    elif self.method == "sys":
        # Inclusion probabilities are not computed here; `probs` is returned
        # as provided by the caller.
        sample, hits = self._sys_select(samp_unit, samp_size, stratum, samp_rate)
    elif self.method == "grs":
        sample, hits = self._grs_select(probs, samp_unit, samp_size, stratum)
    else:
        raise AssertionError(f"Sampling method '{self.method}' is not supported.")

    if shuffled_order is not None:
        # Undo the shuffle with the inverse permutation so that every per-unit
        # output lines up with the order in which the caller supplied the units.
        restore = np.argsort(shuffled_order)
        samp_unit = samp_unit[restore]
        sample = sample[restore]
        hits = hits[restore]
        if stratum is not None:
            stratum = stratum[restore]
        if mos_shuffled:
            mos = mos[restore]
        if self.method == "pps-sys" and probs is not None:
            # Only these probabilities were computed on the shuffled units.
            probs = probs[restore]

    if sample_only:
        frame = self._to_dataframe(samp_unit, stratum, mos, sample, hits, probs)
        return frame.loc[frame["_sample"] == 1]
    elif to_dataframe:
        return self._to_dataframe(samp_unit, stratum, mos, sample, hits, probs)
    else:
        return sample, hits, probs
def inclusion_probs(
    self,
    samp_unit: Array,
    samp_size: Union[Dict[Any, int], int],
    stratum: Optional[Array] = None,
    mos: Optional[Array] = None,
    samp_rate: Union[Dict[Any, float], float, None] = None,
) -> np.ndarray:
    """Computes the inclusion probabilities according to the sampling scheme.

    Args:
        samp_unit (Array): an array of all the observations in the target
            population.
        samp_size (Union[Dict[Any, int], int]): the sample size, or the dictionary
            of sample sizes by stratum, if applicable.
        stratum (Optional[Array], optional): array of the strata associated to the
            population units. Defaults to None.
        mos (Optional[Array], optional): array of the measure of size associated to
            the population units. Defaults to None.
        samp_rate (Union[Dict[Any, float], float, None], optional): sampling rate
            provided by stratum if applicable. Defaults to None.

    Raises:
        AssertionError: raises an assertion error if some of the clusters are
            certainties.
        AssertionError: raises an assertion error if the sampling method has no
            inclusion-probability computation here (anything other than "srs",
            "sys" or one of the "pps-*" methods).

    Returns:
        np.ndarray: an array of the probabilities of inclusion.
    """
    samp_unit = formats.sample_units(samp_unit, unique=True)

    if stratum is not None:
        stratum = formats.numpy_array(stratum)
    if mos is not None:
        mos = formats.numpy_array(mos)

    samp_size = formats.sample_size_dict(samp_size, self.stratification, stratum)

    if samp_size is not None:
        samp_size = self._convert_to_dict(samp_size, int)
    if samp_rate is not None:
        samp_rate = self._convert_to_dict(samp_rate, float)

    if self.method == "srs":
        incl_probs = self._srs_inclusion_probs(samp_unit, samp_size, stratum)
    elif self.method in (
        "pps-brewer",
        "pps-hv",
        "pps-murphy",
        "pps-rs",
        "pps-sys",
    ):
        if self._anycertainty(samp_size, stratum, mos):
            raise AssertionError("Some clusters are certainties.")
        incl_probs = self._pps_inclusion_probs(samp_unit, samp_size, mos, stratum)
    elif self.method == "sys":
        incl_probs = self._sys_inclusion_probs(
            samp_unit, samp_size, stratum, samp_rate
        )
    else:
        # Fail with a clear message instead of an unbound local at `return`.
        raise AssertionError(
            f"Inclusion probabilities cannot be computed for the sampling method '{self.method}'."
        )

    return incl_probs
def kurtosis(y: Array) -> float:
    """Computes the excess kurtosis of y.

    The fourth central moment is divided by the fourth power of the standard
    deviation (np.std with its default settings), and 3 is subtracted.

    Args:
        y (Array): array of numeric values.

    Returns:
        float: the excess kurtosis.
    """
    values = formats.numpy_array(y)
    centered = values - np.mean(values)
    fourth_moment = np.mean(centered**4)

    return float(fourth_moment / np.std(values) ** 4 - 3)
def skewness(y: Array) -> float:
    """Computes the skewness of y.

    The third central moment is divided by the third power of the standard
    deviation (np.std with its default settings).

    Args:
        y (Array): array of numeric values.

    Returns:
        float: the skewness.
    """
    values = formats.numpy_array(y)
    centered = values - np.mean(values)
    third_moment = np.mean(centered**3)

    return float(third_moment / np.std(values) ** 3)
def replicate(
    self,
    samp_weight: Array,
    psu: Array,
    stratum: Array = None,
    rep_coefs: Union[Array, Number] = False,
    rep_prefix: str = None,
    psu_varname: str = "_psu",
    str_varname: str = "_stratum",
) -> pd.DataFrame:
    """Computes replicate sample weights.

    Args:
        samp_weight (Array): array of sample weights. To incorporate the weights
            adjustment in the replicate weights, first replicate the design sample
            weights then apply the adjustments to the replicates.
        psu (Array): array of the primary sampling units.
        stratum (Array, optional): array of the strata. Ignored when the design is
            not stratified. Defaults to None.
        rep_coefs (Union[Array, Number], optional): when falsy, the replicate
            columns are multiplied by the sample weights; otherwise they are
            returned as produced by the replication method. Defaults to False.
        rep_prefix (str, optional): prefix to apply to the replicate weights names.
            Defaults to None.
        psu_varname (str, optional): name of the psu variable in the output
            dataframe. Defaults to "_psu".
        str_varname (str, optional): name of the stratum variable in the output
            dataframe. Defaults to "_stratum".

    Raises:
        AssertionError: raises an assertion error when stratum is None for a
            stratified design.
        AssertionError: raises an assertion error when the replication method is
            not valid.

    Returns:
        pd.DataFrame: a dataframe of the replicates sample weights.
    """
    samp_weight = formats.numpy_array(samp_weight)

    if not self.stratification:
        stratum = None

    self._degree_of_freedom(samp_weight, stratum, psu)

    if self.stratification and stratum is None:
        raise AssertionError("For a stratified design, stratum must be specified.")
    elif stratum is not None:
        stratum_psu = pd.DataFrame({str_varname: stratum, psu_varname: psu})
        stratum_psu.sort_values(by=str_varname, inplace=True)
        key = [str_varname, psu_varname]
    elif self.method == "brr":
        # Unstratified BRR: pair consecutive PSUs (in order of first
        # appearance) into pseudo-strata of two PSUs each.
        psu_array = formats.numpy_array(psu)  # positional indexing needs an array
        _, str_index = np.unique(psu_array, return_index=True)
        checks.assert_brr_number_psus(str_index)
        psus = psu_array[np.sort(str_index)]
        strata = np.repeat(range(1, psus.size // 2 + 1), 2)
        stratum_psu = pd.DataFrame({str_varname: strata, psu_varname: psus})
        psu_pd = pd.DataFrame({psu_varname: psu})
        stratum_psu = pd.merge(
            psu_pd, stratum_psu, on=psu_varname, how="left", sort=False
        )
        stratum_psu = stratum_psu[[str_varname, psu_varname]]
        key = [str_varname, psu_varname]
    else:
        stratum_psu = pd.DataFrame({psu_varname: psu})
        key = [psu_varname]

    psus_ids = stratum_psu.drop_duplicates()

    if self.method == "jackknife":
        self.number_reps = psus_ids.shape[0]
        _rep_data = self._jkn_replicates(psu, stratum)
    elif self.method == "bootstrap":
        _rep_data = self._boot_replicates(psu, stratum)
    elif self.method == "brr":
        _rep_data = self._brr_replicates(psu, stratum)
        # Fay's BRR variance is scaled by 1 / (R * (1 - fay_coef)^2). The inner
        # parentheses matter: `1 / R * (1 - fay_coef)**2` would multiply by the
        # squared term instead of dividing by it.
        brr_coef = 1 / (self.number_reps * pow(1 - self.fay_coef, 2))
        self.rep_coefs = list(brr_coef * np.ones(self.number_reps))
    else:
        raise AssertionError(
            "Replication method not recognized. Possible options are: 'bootstrap', 'brr', and 'jackknife'"
        )

    rep_prefix = self._rep_prefix(rep_prefix)
    _rep_data = self._reps_to_dataframe(psus_ids, _rep_data, rep_prefix)

    samp_weight = pd.DataFrame({"_samp_weight": samp_weight})
    samp_weight.reset_index(drop=True, inplace=True)
    full_sample = pd.concat([stratum_psu, samp_weight], axis=1)
    full_sample = pd.merge(full_sample, _rep_data, on=key, how="left", sort=False)

    if not rep_coefs:
        rep_cols = [col for col in full_sample if col.startswith(rep_prefix)]
        full_sample[rep_cols] = full_sample[rep_cols].mul(samp_weight.values, axis=0)

    return full_sample
def estimate(
    self: TypeRepEst,
    y: Array,
    samp_weight: Array,
    rep_weights: Union[np.ndarray, pd.DataFrame],
    x: Union[np.ndarray, pd.DataFrame, None] = None,
    rep_coefs: Union[float, np.ndarray, None] = None,
    domain: Optional[np.ndarray] = None,
    conservative: bool = False,
    deff: bool = False,  # Todo
    remove_nan: bool = False,
) -> TypeRepEst:
    """Computes the point estimate and its replication-based precision measures.

    The results are stored on the estimator: point_est, variance, stderror,
    lower_ci, upper_ci and coef_var (plus domains when domain is given).

    Args:
        self (TypeRepEst): the estimator; it is also the return value.
        y (Array): variable of interest.
        samp_weight (Array): full-sample weights.
        rep_weights (Union[np.ndarray, pd.DataFrame]): replicate weights, one
            column per replicate.
        x (Union[np.ndarray, pd.DataFrame, None], optional): auxiliary variable,
            required when the parameter is "ratio". Defaults to None.
        rep_coefs (Union[float, np.ndarray, None], optional): replication
            coefficients, forwarded to self._rep_coefs. Defaults to None.
        domain (Optional[np.ndarray], optional): domain of each unit.
            Defaults to None.
        conservative (bool, optional): stored on the estimator and forwarded to
            the variance computation. Defaults to False.
        deff (bool, optional): not used by this method yet. Defaults to False.
        remove_nan (bool, optional): whether to drop the units whose y (or x, for
            a ratio) is NaN before estimating. Defaults to False.

    Raises:
        AssertionError: when the parameter is "ratio" and x is None.

    Returns:
        TypeRepEst: the estimator itself, with the results attached.
    """
    if self.parameter == "ratio" and x is None:
        raise AssertionError("x must be provided for ratio estimation.")

    if not isinstance(rep_weights, np.ndarray):
        rep_weights = formats.numpy_array(rep_weights)

    if remove_nan:
        excluded_units = np.isnan(y)
        if self.parameter == "ratio":
            excluded_units = excluded_units | np.isnan(x)
        y, samp_weight, x, _, domain, _, _ = self._remove_nans(
            excluded_units, y, samp_weight, x, None, domain, None, None
        )
        rep_weights = rep_weights[~excluded_units, :]

    self.conservative = conservative

    # Default the number of replicates to the number of replicate columns.
    if self.number_reps is None:
        self.number_reps = rep_weights.shape[1]

    self._rep_coefs(rep_coefs)

    if domain is not None:
        self.domains = np.unique(domain)

    self.point_est = self._get_point(y, samp_weight, x, domain)
    self.variance = self._get_variance(
        y,
        samp_weight,
        rep_weights,
        np.array(self.rep_coefs),
        x,
        domain,
        conservative,
        remove_nan,
    )

    # Fallback degrees of freedom, only when none were set beforehand.
    if self.degree_of_freedom is None:
        if self.method == "brr":
            self.degree_of_freedom = int(self.number_reps / 2)
        else:
            self.degree_of_freedom = int(self.number_reps) - 1

    t_quantile = student.ppf(1 - self.alpha / 2, df=self.degree_of_freedom)

    self.lower_ci, self.upper_ci = self._get_confint(
        self.parameter, self.point_est, self.variance, t_quantile
    )
    self.coef_var = self._get_coefvar(self.parameter, self.point_est, self.variance)

    # Standard errors are the square roots of the variances; for proportions
    # the variance of each key is itself indexed by level.
    for key in self.variance:
        if self.parameter == "proportion":
            self.stderror[key] = {
                level: self.variance[key][level] ** 0.5
                for level in self.variance[key]
            }
        else:
            self.stderror[key] = self.variance[key] ** 0.5

    return self
def fit(
    self,
    yhat: Array,
    X: Array,
    area: Array,
    error_std: Array,
    re_std_start: float = 0.001,
    b_const: Union[np.array, Number] = 1.0,
    intercept: bool = True,
    tol: float = 1e-8,
    maxiter: int = 100,
) -> None:
    """Fits the linear mixed models to estimate the fixed effects and the standard
    error of the random effects. In addition, the method provides statistics
    related to the model fitting e.g. convergence status, log-likelihood, and more.

    Args:
        yhat (Array): an array of the estimated area level survey estimates also
            called the direct estimates.
        X (Array): a multi-dimensional array of the auxiliary information
            associated to the sampled areas.
        area (Array): provides the areas associated to the direct estimates.
        error_std (Array): array of the standard deviations of the sampling
            errors; its square is passed on as sigma2_e.
        re_std_start (float, optional): starting value for the standard deviation
            of the random effects; its square is passed on as sigma2_v_start.
            Defaults to 0.001.
        b_const (Union[np.array, Number], optional): constant(s) passed to the
            fitting routines; a single number is expanded to one value per area.
            Defaults to 1.0.
        intercept (bool, optional): whether to prepend a column of ones to X.
            Defaults to True.
        tol (float, optional): tolerance used for convergence criteria.
            Defaults to 1e-8.
        maxiter (int, optional): maximum number of iterations for the fitting
            algorithm. Defaults to 100.
    """
    # Convert every array-like input up front so that lists are accepted: the
    # sizes, powers and diagonal below all need NumPy arrays.
    area = formats.numpy_array(area)
    yhat = formats.numpy_array(yhat)
    X = formats.numpy_array(X)
    error_std = formats.numpy_array(error_std)

    if isinstance(b_const, (int, float)):
        b_const = np.ones(area.size) * b_const
    else:
        b_const = formats.numpy_array(b_const)

    if intercept and isinstance(X, np.ndarray):
        X = np.insert(X, 0, 1, axis=1)

    (
        sigma2_v,
        sigma2_v_cov,
        iterations,
        tolerance,
        convergence,
    ) = self._iterative_fisher_scoring(
        area=area,
        yhat=yhat,
        X=X,
        sigma2_e=error_std**2,
        b_const=b_const,
        sigma2_v_start=re_std_start**2,
        tol=tol,
        maxiter=maxiter,
    )

    beta, beta_cov = self._fixed_coefficients(
        area=area,
        yhat=yhat,
        X=X,
        sigma2_e=error_std**2,
        sigma2_v=sigma2_v,
        b_const=b_const,
    )

    self.yhat = yhat
    self.error_std = error_std
    self.X = X
    self.area = area
    self.fixed_effects = beta
    self.fe_std = np.diag(beta_cov) ** (1 / 2)
    self.re_std = sigma2_v ** (1 / 2)
    self.re_std_cov = sigma2_v_cov
    self.convergence["achieved"] = convergence
    self.convergence["iterations"] = iterations
    self.convergence["precision"] = tolerance

    m = yhat.size
    # NOTE(review): p already includes "+ 1" and the criteria below add one
    # more - confirm the intended number of parameters.
    p = X.shape[1] + 1
    # NOTE(review): V adds sigma2_v to every entry (m x m matrix of ones) and
    # does not involve b_const - confirm against the area-level model.
    Z_b2_Z = np.ones(shape=(m, m))
    V = np.diag(error_std**2) + sigma2_v * Z_b2_Z
    logllike = self._log_likelihood(yhat, X=X, beta=self.fixed_effects, V=V)
    self.goodness["loglike"] = logllike
    self.goodness["AIC"] = -2 * logllike + 2 * (p + 1)
    self.goodness["BIC"] = -2 * logllike + math.log(m) * (p + 1)

    self.fitted = True
def adjust(
    self,
    samp_weight: np.ndarray,
    adjust_class: np.ndarray,
    resp_status: np.ndarray,
    resp_dict: Union[Dict[str, StringNumber], None] = None,
    unknown_to_inelig: bool = True,
) -> np.ndarray:
    """Adjusts the sample weights to account for non-response.

    Besides returning the adjusted weights, the method records the adjustment
    factors in self.adjust_factor, the design effect of the adjusted weights in
    self.deff_wgt and sets self.adjust_method to "nonresponse".

    Args:
        samp_weight (np.ndarray): array of the pre-adjustment sample weights. This
            vector should contain numeric values.
        adjust_class (np.ndarray): array indicating the adjustment class of each
            sample unit. The adjustments are performed within the classes defined
            by this parameter; None means a single class for the whole sample.
        resp_status (np.ndarray): array indicating the eligibility and response
            status of the sample units: ineligible (in), respondent (rr),
            nonrespondent (nr), not known / unknown (uk). If the values are not in
            ("in", "rr", "nr", "uk") then resp_dict is required.
        resp_dict (Union[Dict[str, StringNumber], None], optional): dictionary
            mapping "in", "rr", "nr" and "uk" to the values used in resp_status.
            For example, with 0 for ineligible, 1 for respondent, 2 for
            nonrespondent and 9 for unknown, the dictionary is
            {"in": 0, "rr": 1, "nr": 2, "uk": 9}. Defaults to None.
        unknown_to_inelig (bool, optional): forwarded to self._adjust_factor.
            Defaults to True.

    Raises:
        AssertionError: raises an assertion error if adjust_class is not a list,
            numpy array, or pandas dataframe/series.

    Returns:
        np.ndarray: array of the adjusted sample weights.
    """
    resp_code = self._response(resp_status, resp_dict)
    samp_weight = formats.numpy_array(samp_weight)
    # NaN marks the units that no adjustment class has filled in.
    adjusted_weight = np.ones(samp_weight.size) * np.nan

    if adjust_class is None:
        overall_factor, self.adjust_factor["__none__"] = self._adjust_factor(
            samp_weight, resp_code, unknown_to_inelig
        )
        adjusted_weight = overall_factor * samp_weight
    else:
        # Bring every accepted input type to a pandas object first.
        if isinstance(adjust_class, list):
            adjust_class = pd.DataFrame(np.column_stack(adjust_class))
        elif isinstance(adjust_class, np.ndarray):
            adjust_class = pd.DataFrame(adjust_class)
        elif not isinstance(adjust_class, (pd.Series, pd.DataFrame)):
            raise AssertionError(
                "adjust_class must be an numpy ndarray, a list of numpy ndarray or a pandas dataframe."
            )

        adjust_array = formats.dataframe_to_array(adjust_class)

        for c in np.unique(adjust_array):
            in_class = adjust_array == c
            class_weight = samp_weight[in_class]
            class_factor, self.adjust_factor[c] = self._adjust_factor(
                class_weight, resp_code[in_class], unknown_to_inelig
            )
            adjusted_weight[in_class] = class_factor * class_weight

    self.deff_wgt = self.deff_weight(adjusted_weight)
    self.adjust_method = "nonresponse"

    return adjusted_weight
def calibrate(
    self,
    samp_weight: Array,
    aux_vars: Array,
    control: Union[Dict[StringNumber, Union[DictStrNum, Number]], None] = None,
    domain: Optional[Array] = None,
    scale: Union[Array, Number] = 1,
    bounded: bool = False,
    additive: bool = False,
) -> np.ndarray:
    """Calibrates the sample weights.

    Args:
        samp_weight (Array): array of sample weights.
        aux_vars (Array): array of auxiliary variables.
        control (Union[Dict[StringNumber, Union[DictStrNum, Number]], None],
            optional): the control totals. Without domain, a dictionary whose
            values are the controls; with domain, a dictionary mapping each domain
            to a number or to a dictionary of controls. Required although it
            defaults to None.
        domain (Optional[Array], optional): array indicating the calibration
            domain of each sample unit. Defaults to None.
        scale (Union[Array, Number], optional): scale factor(s); a single number is
            repeated for every sample unit. Defaults to 1.
        bounded (bool, optional): not used by this method. Defaults to False.
        additive (bool, optional): with domain, whether to compute one column of
            calibrated weights per domain using the whole sample, instead of
            calibrating each domain separately. Defaults to False.

    Raises:
        AssertionError: raises an assertion error if control is None, or if a
            domain has no control that is a number or a dictionary.

    Returns:
        np.ndarray: an array of the calibrated sample weights.
    """
    # Convert before reading the size so that lists are accepted too.
    samp_weight = formats.numpy_array(samp_weight)
    samp_size = samp_weight.size
    aux_vars = formats.numpy_array(aux_vars)
    if domain is not None:
        domain = formats.numpy_array(domain)

    if control is None:
        raise AssertionError("control must be provided to calibrate the sample weights.")

    # NumPy scalars (e.g. np.int64) are not instances of int, so accept them
    # explicitly alongside the Python numeric types.
    scalar_types = (int, float, np.integer, np.floating)

    if isinstance(scale, scalar_types):
        scale = np.repeat(scale, samp_size)
    else:
        scale = formats.numpy_array(scale)

    if aux_vars.shape == (samp_size,):
        x_w = aux_vars * samp_weight
        one_dimension = True
    else:
        x_w = np.transpose(aux_vars) * samp_weight
        one_dimension = False

    if domain is None:
        if one_dimension:
            x_w_total = np.sum(x_w)
        else:
            x_w_total = np.sum(x_w, axis=1)
        core_factor = self._core_matrix(
            samp_weight=samp_weight,
            x=aux_vars,
            x_weighted_total=x_w_total,
            x_control=np.array(list(control.values())),
            scale=scale,
        )
        adjust_factor = 1 + self._calib_wgt(aux_vars, core_factor) / scale
    else:
        domains = np.unique(domain)
        # NaN marks the units that no domain has filled in.
        if additive:
            adjust_factor = np.ones((samp_size, domains.size)) * np.nan
        else:
            adjust_factor = np.ones(samp_size) * np.nan

        for k, d in enumerate(domains):
            in_domain = domain == d
            if one_dimension:
                x_w_total_d = np.sum(x_w[in_domain])
            else:
                x_w_total_d = np.sum(np.transpose(x_w)[in_domain], axis=0)

            control_d = control.get(d)
            if isinstance(control_d, scalar_types):
                control_d_values = [control_d]
            elif isinstance(control_d, dict):
                control_d_values = list(control_d.values())
            else:
                # Without this guard the controls of the previous domain would
                # silently be reused for this one.
                raise AssertionError(
                    f"control must provide a number or a dictionary for domain {d!r}."
                )

            if additive:
                core_factor_d = self._core_matrix(
                    samp_weight=samp_weight,
                    x=aux_vars,
                    x_weighted_total=x_w_total_d,
                    x_control=np.array(control_d_values),
                    scale=scale,
                )
                adjust_factor[:, k] = in_domain + self._calib_wgt(
                    aux_vars, core_factor_d
                ) / scale
            else:
                x_d = aux_vars[in_domain]
                scale_d = scale[in_domain]
                core_factor_d = self._core_matrix(
                    samp_weight=samp_weight[in_domain],
                    x=x_d,
                    x_weighted_total=x_w_total_d,
                    x_control=np.array(control_d_values),
                    scale=scale_d,
                )
                adjust_factor[in_domain] = 1 + self._calib_wgt(
                    x_d, core_factor_d
                ) / scale_d

    if additive:
        calib_weight = np.transpose(np.transpose(adjust_factor) * samp_weight)
    else:
        calib_weight = samp_weight * adjust_factor

    self.adjust_method = "calibration"

    return calib_weight