def estimate_H(X, Y, estimator, B, ratio, discrete_output=None):
    """Estimate H per covariate with a block or incomplete U-statistics HSIC estimator.

    :param X: covariate data; indexed as a 2-D array (samples x features)
    :param Y: response data with as many rows as X (presumably 1-D, since a
        trailing axis is appended before it is handed to the estimator)
    :param estimator: 'inc' selects the incomplete U-statistics estimator,
        any other value selects the block estimator
    :param B: block size (block estimator only)
    :param ratio: size of incomplete U-statistics estimator ('inc' only)
    :param discrete_output: optional pair of frequency dictionaries forwarded
        to KDiscrete; when None, a Gaussian kernel is used for Y
    :return: tuple (H_estimates, H, m) where H_estimates has shape (p, m) and
        holds the flattened estimates for each column of X, H is the row-wise
        mean of H_estimates, and m is the number of estimates per column
    """
    assert Y.shape[0] == X.shape[0]
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Kernel on X: Gaussian, bandwidth is the squared value of util.meddistance.
    kx = kernel.KGauss(util.meddistance(X, subsample=1000) ** 2)

    # Kernel on Y: discrete when frequency tables are supplied, Gaussian otherwise.
    if discrete_output is None:
        ky = kernel.KGauss(
            util.meddistance(Y[:, np.newaxis], subsample=1000) ** 2
        )
    else:
        freq_dict1, freq_dict2 = discrete_output
        ky = KDiscrete(freq_dict1, freq_dict2)

    # Pick the estimator together with the number of estimates it is expected to yield.
    if estimator == 'inc':
        hsic_H = hsic.HSIC_Inc(kx, ky, ratio=ratio)
        m = int(n_samples * ratio)
    else:  # 'block'
        hsic_H = hsic.HSIC_Block(kx, ky, bsize=B)
        m = int(np.floor(n_samples / B))

    y_column = Y[:, np.newaxis]
    H_estimates = np.zeros((n_features, m))
    for feature in range(n_features):
        per_feature = hsic_H.estimates(X[:, feature, np.newaxis], y_column)
        H_estimates[feature, :] = np.reshape(per_feature, -1)

    H = np.mean(H_estimates, axis=1)
    return H_estimates, H, m
def sel_inf(self, X, Y, inf_type, alpha, niv, H0=None, M0=None, i=None):
    """Post-selection inference

    :param X, Y: covariate and response data
    :param inf_type: one-sided hypothesis testing or two-sided confidence
        interval calculation; only 'test' is accepted here
    :param alpha: level 1-alpha
    :param niv: number of important variables, used for reporting results
    :return: sim.Inference_Result built from the selection indicator, the
        rejection indicators and (for the non-Poly selector) the p-values

    H0, M0 and i are not used
    """
    assert inf_type == 'test'
    n_features = X.shape[1]

    # Initialising kernels
    kx = kernel.KGauss(util.meddistance(X, subsample=1000) ** 2)
    if self.discrete_output:
        # NOTE(review): KDiscrete is built without arguments here, whereas
        # other call sites in this file pass two frequency dictionaries --
        # confirm that the no-argument form is supported.
        ky = KDiscrete()
    else:
        ky = kernel.KGauss(
            util.meddistance(Y[:, np.newaxis], subsample=1000) ** 2
        )

    # HSIC estimator: incomplete U-statistics or block
    if self.estimator == 'inc':
        hsic_H = hsic.HSIC_Inc(kx, ky, ratio=self.l)
    else:  # 'block'
        hsic_H = hsic.HSIC_Block(kx, ky, bsize=self.B)

    # Selection procedure: Poly or Multi
    selector_cls = PolySel if self.poly else MultiSel
    feat_select = selector_cls(hsic_H)
    results = feat_select.test(
        X, Y[:, np.newaxis], args=self.n_select, alpha=alpha
    )

    # Reporting
    sel_vars = results['sel_vars']
    h0_rejs = results['h0_rejs']

    # 1 for every selected variable
    selected = np.zeros(n_features)
    selected[sel_vars] = 1

    # 1 for every selected variable whose null hypothesis was rejected
    rejected = np.zeros(n_features)
    rejected[sel_vars[h0_rejs]] = 1

    # Selected variables restricted to the first niv positions
    rejected_true = selected.copy()
    rejected_true[niv:] = 0

    if self.poly:
        # p-values not provided for Poly
        p_values = None
    else:
        # -1 marks variables without a p-value
        p_values = -np.ones(n_features)
        p_values[sel_vars] = results['pvals']

    return sim.Inference_Result(
        n_features,
        None,
        selected,
        {'H': rejected},
        {'H': rejected_true},
        {'H': p_values},
        None,
    )
def estimate_M(X, estimator, B, ratio):
    """Estimate M with a block or incomplete U-statistics HSIC estimator.

    :param X: covariate data; indexed as a 2-D array (samples x features)
    :param estimator: 'inc' selects the incomplete U-statistics estimator,
        any other value selects the block estimator
    :param B: block size (block estimator only)
    :param ratio: size of incomplete U-statistics estimator ('inc' only)
    :return: tuple (M_true, M) where M_true is the symmetric (p, p) matrix of
        averaged pairwise estimates between columns of X and M is the result
        of nearestPD applied to it
    """
    n_features = X.shape[1]

    # The same Gaussian kernel is used on both arguments of the estimator.
    kx = kernel.KGauss(util.meddistance(X, subsample=1000) ** 2)
    if estimator == 'inc':
        hsic_M = hsic.HSIC_Inc(kx, kx, ratio=ratio)
    else:  # 'block'
        hsic_M = hsic.HSIC_Block(kx, kx, bsize=B)

    # Fill the lower triangle (diagonal included) and mirror each entry.
    M_true = np.zeros((n_features, n_features))
    for row in range(n_features):
        row_feature = X[:, row, np.newaxis]
        for col in range(row + 1):
            col_feature = X[:, col, np.newaxis]
            averaged = np.mean(hsic_M.estimates(row_feature, col_feature))
            M_true[row, col] = M_true[col, row] = averaged

    M = nearestPD(M_true)  # positive definite approximation
    return M_true, M
def sel_inf(self, X, Y, inf_type, alpha, niv, H0=None, M0=None, i=None,
            unbiased_parallel=False, n_jobs=20):
    """Post-selection inference

    :param X, Y: covariate and response data
    :param inf_type: one-sided hypothesis testing or two-sided confidence
        interval calculation; only 'test' is accepted here
    :param alpha: level 1-alpha
    :param niv: number of important variables, used for reporting results
    :return: sim.Inference_Result built from the selection indicator, the
        rejection indicators and the p-values (-1 where none is reported)

    H0, M0, i, unbiased_parallel and n_jobs are not used
    """
    assert inf_type == 'test'
    n_features = X.shape[1]

    # Initialising kernels
    kx = kernel.KGauss(util.meddistance(X, subsample=1000) ** 2)
    if self.discrete_output:
        # Frequency table of the observed response values; the same table
        # is passed for both KDiscrete arguments.
        values, counts = np.unique(Y, return_counts=True)
        freq_dict = dict(zip(values, counts))
        ky = KDiscrete(freq_dict, freq_dict)
    else:
        ky = kernel.KGauss(
            util.meddistance(Y[:, np.newaxis], subsample=1000) ** 2
        )

    # HSIC estimator: incomplete U-statistics or block
    if self.estimator == 'inc':
        hsic_H = hsic.HSIC_Inc(kx, ky, ratio=self.l)
    else:  # 'block'
        hsic_H = hsic.HSIC_Block(kx, ky, bsize=self.B)

    # Selection procedure: Poly or Multi
    selector_cls = PolySel if self.poly else MultiSel
    feat_select = selector_cls(hsic_H)

    # Behaviour for evaluation of power w.r.t. first feature: the test is
    # only run when feature 0 is among the n_select largest entries of
    # hsic_H.compute. Regular behaviour: the test is always run.
    run_test = True
    if self.only_evaluate_first:
        params = hsic_H.compute(X, Y[:, np.newaxis])
        top_vars = np.argpartition(
            params, -self.n_select, axis=0
        )[-self.n_select:]
        run_test = 0 in top_vars

    if run_test:
        results = feat_select.test(
            X, Y[:, np.newaxis], args=self.n_select, alpha=alpha
        )
        sel_vars = results['sel_vars']
        h0_rejs = results['h0_rejs']
    else:
        # fake values: the last n_select variables count as selected and
        # the last of them as rejected
        sel_vars = np.arange(n_features - self.n_select, n_features)
        h0_rejs = np.array([self.n_select - 1])

    # Reporting
    # 1 for every selected variable
    selected = np.zeros(n_features)
    selected[sel_vars] = 1

    # 1 for every selected variable whose null hypothesis was rejected
    rejected = np.zeros(n_features)
    rejected[sel_vars[h0_rejs]] = 1

    # Selected variables restricted to the first niv positions
    rejected_true = selected.copy()
    rejected_true[niv:] = 0

    # p-values not provided for Poly
    # p-values not of interest for evaluation of empirical power
    p_values = -np.ones(n_features)
    if not (self.poly or self.only_evaluate_first):
        p_values[sel_vars] = results['pvals']

    inf_res = sim.Inference_Result(
        n_features,
        None,
        selected,
        {'H': rejected},
        {'H': rejected_true},
        {'H': p_values},
        None,
    )
    return inf_res