def estimate_H(X, Y, estimator, B, ratio, discrete_output = None):
    """Estimate H with the block or the incomplete U-statistics estimator.

    One HSIC estimator is built from a kernel on ``X`` and a kernel on ``Y``
    and then evaluated separately on every column of ``X`` against ``Y``.

    :param X: covariates; feature ``i`` is read as ``X[:, i]``
    :param Y: response with as many entries along axis 0 as ``X`` (asserted)
    :param estimator: ``'inc'`` selects ``hsic.HSIC_Inc``; any other value
        selects ``hsic.HSIC_Block``
    :param B: block size (block estimator only)
    :param ratio: size of the incomplete U-statistics estimator
    :param discrete_output: optional pair of frequency dictionaries; when
        given, ``KDiscrete`` is used on ``Y`` instead of a Gaussian kernel
    :return: ``(H_estimates, H, m)`` where ``H_estimates`` has shape
        ``(p, m)``, ``H`` is its mean over axis 1 and ``m`` is
        ``int(n * ratio)`` for ``'inc'`` and ``floor(n / B)`` otherwise
    """
    assert Y.shape[0] == X.shape[0]
    n_obs = X.shape[0]
    n_feat = X.shape[1]

    # Gaussian kernel on X, bandwidth = squared util.meddistance value.
    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)

    if discrete_output is None:
        bandwidth_y = util.meddistance(Y[:, np.newaxis], subsample = 1000)**2
        kernel_y = kernel.KGauss(bandwidth_y)
    else:
        freq_first, freq_second = discrete_output
        kernel_y = KDiscrete(freq_first, freq_second)

    if estimator == 'inc':
        hsic_est = hsic.HSIC_Inc(kernel_x, kernel_y, ratio = ratio)
        n_est = int(n_obs * ratio)
    else:
        # Every value other than 'inc' is treated as 'block'.
        hsic_est = hsic.HSIC_Block(kernel_x, kernel_y, bsize = B)
        n_est = int(np.floor(n_obs / B))

    per_feature = np.zeros((n_feat, n_est))
    response = Y[:, np.newaxis]
    for idx in range(n_feat):
        raw = hsic_est.estimates(X[:, idx, np.newaxis], response)
        # Flatten so the estimates fit one row of the (p, m) array.
        per_feature[idx, :] = np.reshape(raw, -1)

    return per_feature, np.mean(per_feature, axis = 1), n_est
def one_trial(i, n_samples, problem, n_select, algo):
    """Run one feature-selection trial and report its TPR and FPR.

    The dependence measure is chosen by the module-level ``mmd_or_hsic``:
    if it contains ``'MMD'`` an ``mmd.MMD_Inc`` is built on the stacked
    samples, otherwise an ``hsic.HSIC_Inc`` with one kernel per sample.

    :param i: trial index; passed to ``problem.sample`` and used as the seed
        of the selection test
    :param n_samples: number of samples requested from ``problem``
    :param problem: object providing ``sample`` and ``is_true``
    :param n_select: number of features to select
    :param algo: callable turning the measure into a selector with ``test``
    :return: ``(tpr, fpr)``; both are 0 unless more than one variable
        was selected
    """
    sample_p, sample_r = problem.sample(n_samples, i)

    if 'MMD' not in mmd_or_hsic:
        bw_p = util.meddistance(sample_p, subsample=1000)**2
        bw_r = util.meddistance(sample_r, subsample=1000)**2
        metric = hsic.HSIC_Inc(kernel.KGauss(bw_p), kernel.KGauss(bw_r), 5)
    else:
        stacked = np.vstack((sample_p, sample_r))
        bw_joint = util.meddistance(stacked, subsample=1000)**2
        metric = mmd.MMD_Inc(kernel.KGauss(bw_joint))

    outcome = algo(metric).test(sample_p, sample_r, args=n_select, seed=i)

    # Rates default to zero and are only computed when more than one
    # variable was selected.
    tpr, fpr = 0, 0
    chosen = outcome['sel_vars']
    if chosen.shape[0] > 1:
        relevant = problem.is_true(chosen)
        n_relevant = np.sum(relevant)
        rejected = outcome['h0_rejs']
        # Denominators are floored at 1 to avoid division by zero.
        fpr = np.sum(rejected[np.logical_not(relevant)]) / max(
            n_select - n_relevant, 1)
        tpr = np.sum(rejected[relevant]) / max(n_relevant, 1)

    logging.debug("TPR is :{0:.3f} FPR is :{1:.3f}".format(tpr, fpr))
    return tpr, fpr
def sel_inf(self, X, Y, inf_type, alpha, niv, H0 = None, M0 = None, i = None):
    """Post-selection inference with the Multi or Poly selection procedure.

    :param X, Y: covariate and response data
    :param inf_type: one-sided hypothesis testing or two-sided confidence
        interval calculation; only ``'test'`` is accepted (asserted)
    :param alpha: level 1-alpha, forwarded to the selection test
    :param niv: number of important variables, used for reporting results:
        entries from index ``niv`` onwards are zeroed in the "true"
        rejection indicator
    :return: ``sim.Inference_Result`` built from the selection and
        rejection indicators and the p-values

    H0, M0 and i are not used.
    """
    assert inf_type == 'test'
    n_feat = X.shape[1]

    # Initialising kernels
    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    if not self.discrete_output:
        bandwidth_y = util.meddistance(Y[:, np.newaxis], subsample = 1000)**2
        kernel_y = kernel.KGauss(bandwidth_y)
    else:
        # NOTE(review): KDiscrete is built without arguments here -- confirm
        # this matches its constructor.
        kernel_y = KDiscrete()

    if self.estimator == 'inc':
        hsic_est = hsic.HSIC_Inc(kernel_x, kernel_y, ratio = self.l)
    else:
        # Every value other than 'inc' is treated as 'block'.
        hsic_est = hsic.HSIC_Block(kernel_x, kernel_y, bsize = self.B)

    # Poly selector when self.poly is set, Multi selector otherwise.
    selector = (PolySel if self.poly else MultiSel)(hsic_est)
    results = selector.test(X, Y[:, np.newaxis], args = self.n_select, alpha = alpha)

    # Reporting
    sel_vars = results['sel_vars']
    h0_rejs = results['h0_rejs']

    selected = np.zeros(n_feat)
    selected[sel_vars] = 1

    rejected = np.zeros(n_feat)
    rejected[sel_vars[h0_rejs]] = 1

    # Same indicator as `selected`, with entries from `niv` onwards cleared.
    rejected_true = selected.copy()
    rejected_true[niv:] = 0

    # p-values not provided for Poly
    pvals_h = None
    if not self.poly:
        # -1 marks features that were not selected.
        pvals_h = -np.ones(n_feat)
        pvals_h[sel_vars] = results['pvals']
    # NOTE(review): p-values are wrapped under 'H' in both branches, like
    # the rejection indicators -- confirm against the original layout.
    p_values = {'H': pvals_h}

    return sim.Inference_Result(n_feat, None, selected, {'H': rejected},
                                {'H': rejected_true}, p_values, None)
def estimate_H_unbiased_parallel(X, Y, discrete_output = None):
    """Parallelised estimation of H with the unbiased HSIC estimator.

    One ``hsic.HSIC_U.compute`` call per column of ``X`` is dispatched
    through ``Parallel(n_jobs = -1)``.

    :param X: covariates; feature ``i`` is read as ``X[:, i]``
    :param Y: response with as many entries along axis 0 as ``X`` (asserted)
    :param discrete_output: optional pair of frequency dictionaries; when
        given, ``KDiscrete`` is used on ``Y`` instead of a Gaussian kernel
    :return: ``np.array`` of the per-feature results, in feature order
    """
    assert Y.shape[0] == X.shape[0]
    n_feat = X.shape[1]

    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    if discrete_output is None:
        bandwidth_y = util.meddistance(Y[:, np.newaxis], subsample = 1000)**2
        kernel_y = kernel.KGauss(bandwidth_y)
    else:
        freq_first, freq_second = discrete_output
        kernel_y = KDiscrete(freq_first, freq_second)

    hsic_est = hsic.HSIC_U(kernel_x, kernel_y)

    def hsic_for_feature(col):
        # HSIC between one covariate column and the response.
        return hsic_est.compute(X[:, col, np.newaxis], Y[:, np.newaxis])

    per_feature = Parallel(n_jobs = -1)(
        delayed(hsic_for_feature)(col) for col in range(n_feat)
    )
    return np.array(per_feature)
def estimate_H_unbiased(X, Y, discrete_output = None):
    """Estimate H with the unbiased HSIC estimator.

    :param X: covariates; feature ``i`` is read as ``X[:, i]``
    :param Y: response with as many entries along axis 0 as ``X`` (asserted)
    :param discrete_output: optional pair of frequency dictionaries; when
        given, ``KDiscrete`` is used on ``Y`` instead of a Gaussian kernel
    :return: array of length ``p`` holding one ``hsic.HSIC_U.compute``
        value per feature
    """
    assert Y.shape[0] == X.shape[0]
    n_feat = X.shape[1]

    # X- and Y-kernels; the squared util.meddistance value serves as the
    # bandwidth of each Gaussian kernel.
    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    if discrete_output is None:
        bandwidth_y = util.meddistance(Y[:, np.newaxis], subsample = 1000)**2
        kernel_y = kernel.KGauss(bandwidth_y)
    else:
        freq_first, freq_second = discrete_output
        kernel_y = KDiscrete(freq_first, freq_second)

    # H estimation, one feature at a time.
    hsic_est = hsic.HSIC_U(kernel_x, kernel_y)
    H = np.zeros(n_feat)
    for col in range(n_feat):
        feature = X[:, col, np.newaxis]
        H[col] = hsic_est.compute(feature, Y[:, np.newaxis])
    return H
def estimate_M_unbiased(X):
    """Estimate M with the unbiased HSIC estimator.

    Entry ``(i, j)`` is ``hsic.HSIC_U.compute`` applied to columns ``i`` and
    ``j`` of ``X``; only the lower triangle is computed and then mirrored.

    :param X: covariates; feature ``i`` is read as ``X[:, i]``
    :return: ``(M_true, M)`` -- the raw symmetric matrix and the result of
        ``nearestPD`` applied to it
    """
    n_feat = X.shape[1]
    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    hsic_est = hsic.HSIC_U(kernel_x, kernel_x)

    M_true = np.zeros((n_feat, n_feat))
    for row in range(n_feat):
        row_feature = X[:, row, np.newaxis]
        for col in range(row + 1):
            value = hsic_est.compute(row_feature, X[:, col, np.newaxis])
            # Fill both triangles at once, due to symmetry.
            M_true[row, col] = M_true[col, row] = value

    # positive definite approximation
    return M_true, nearestPD(M_true)
def one_trial(i, n_samples, algsel, problem, n_select, hsic_e, params):
    """Run one HSIC-based feature-selection trial and report TPR and FPR.

    :param i: trial index; passed to ``problem.sample`` and used as the seed
        of the selection test
    :param n_samples: number of samples requested from ``problem``
    :param algsel: callable building the selector; called as
        ``algsel(estimator, params=True)``
    :param problem: object providing ``sample`` and ``is_true``
    :param n_select: number of features to select
    :param hsic_e: callable building the HSIC estimator from two Gaussian
        kernels and ``params(n_samples)``
    :param params: callable mapping ``n_samples`` to the third estimator
        argument
    :return: ``(tpr, fpr)``; both are 0 unless more than one variable
        was selected
    """
    sample_p, sample_r = problem.sample(n_samples, i)

    bw_p = util.meddistance(sample_p, subsample=1000)**2
    bw_r = util.meddistance(sample_r, subsample=1000)**2
    estimator = hsic_e(
        kernel.KGauss(bw_p),
        kernel.KGauss(bw_r),
        params(n_samples),
    )

    selector = algsel(estimator, params=True)
    outcome = selector.test(sample_p, sample_r, args=n_select, seed=i)

    # Rates default to zero and are only computed when more than one
    # variable was selected.
    tpr, fpr = 0, 0
    chosen = outcome['sel_vars']
    if chosen.shape[0] > 1:
        relevant = problem.is_true(chosen)
        n_relevant = np.sum(relevant)
        rejected = outcome['h0_rejs']
        # Denominators are floored at 1 to avoid division by zero.
        fpr = np.sum(rejected[np.logical_not(relevant)]) / max(
            n_select - n_relevant, 1)
        tpr = np.sum(rejected[relevant]) / max(n_relevant, 1)

    logging.debug("TPR is :{0:.3f} FPR is :{1:.3f}".format(tpr, fpr))
    return tpr, fpr
def estimate_M(X, estimator, B, ratio):
    """Estimate M with the block or the incomplete U-statistics estimator.

    Entry ``(i, j)`` is the mean of the estimator's ``estimates`` for
    columns ``i`` and ``j`` of ``X``; only the lower triangle is computed
    and then mirrored.

    :param X: covariates; feature ``i`` is read as ``X[:, i]``
    :param estimator: ``'inc'`` selects ``hsic.HSIC_Inc``; any other value
        selects ``hsic.HSIC_Block``
    :param B: block size (block estimator only)
    :param ratio: size of the incomplete U-statistics estimator
    :return: ``(M_true, M)`` -- the raw symmetric matrix and the result of
        ``nearestPD`` applied to it
    """
    n_feat = X.shape[1]
    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)

    if estimator == 'inc':
        hsic_est = hsic.HSIC_Inc(kernel_x, kernel_x, ratio = ratio)
    else:
        # Every value other than 'inc' is treated as 'block'.
        hsic_est = hsic.HSIC_Block(kernel_x, kernel_x, bsize = B)

    M_true = np.zeros((n_feat, n_feat))
    for row in range(n_feat):
        row_feature = X[:, row, np.newaxis]
        for col in range(row + 1):
            samples = hsic_est.estimates(row_feature, X[:, col, np.newaxis])
            # Fill both triangles at once, due to symmetry.
            M_true[row, col] = M_true[col, row] = np.mean(samples)

    # positive definite approximation
    return M_true, nearestPD(M_true)
def estimate_M_unbiased_parallel(X):
    """Parallelised estimation of M with the unbiased HSIC estimator.

    One ``hsic.HSIC_U.compute`` call per lower-triangular index pair is
    dispatched through ``Parallel(n_jobs = -1)``; the results are then
    written symmetrically into the matrix.

    :param X: covariates; feature ``i`` is read as ``X[:, i]``
    :return: ``(M_true, M)`` -- the raw symmetric matrix and the result of
        ``nearestPD`` applied to it
    """
    n_feat = X.shape[1]
    kernel_x = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    hsic_est = hsic.HSIC_U(kernel_x, kernel_x)

    def hsic_for_pair(row, col):
        # HSIC between two covariate columns.
        return hsic_est.compute(X[:, row, np.newaxis], X[:, col, np.newaxis])

    # Lower-triangular index pairs (diagonal included), row by row; the
    # parallel results are paired with them in this same order.
    lower_pairs = [(row, col) for row in range(n_feat) for col in range(row + 1)]
    values = Parallel(n_jobs = -1)(
        delayed(hsic_for_pair)(row, col) for row, col in lower_pairs
    )

    M_true = np.zeros((n_feat, n_feat))
    for (row, col), value in zip(lower_pairs, values):
        M_true[row, col] = M_true[col, row] = value

    # positive definite approximation
    return M_true, nearestPD(M_true)
def one_trial(i, n_samples, algsel, problem, n_select, mmd_e):
    """Run one MMD-based feature-selection trial and report TPR and FPR.

    :param i: trial index; passed to ``problem.sample`` and used as the seed
        of the selection test
    :param n_samples: number of samples requested from ``problem``
    :param algsel: callable turning the MMD estimator into a selector
    :param problem: object providing ``sample`` and ``is_true``
    :param n_select: number of features to select
    :param mmd_e: callable building the MMD estimator from a Gaussian kernel
    :return: ``(tpr, fpr)``; both are 0 unless more than one variable
        was selected
    """
    sample_p, sample_r = problem.sample(n_samples, i)

    # A single bandwidth is computed on both samples stacked together.
    stacked = np.vstack((sample_p, sample_r))
    bw_joint = util.meddistance(stacked, subsample=1000)**2
    estimator = mmd_e(kernel.KGauss(bw_joint))

    selector = algsel(estimator)
    outcome = selector.test(sample_p, sample_r, args=n_select, seed=i)

    # Rates default to zero and are only computed when more than one
    # variable was selected.
    tpr, fpr = 0, 0
    chosen = outcome['sel_vars']
    if chosen.shape[0] > 1:
        relevant = problem.is_true(chosen)
        n_relevant = np.sum(relevant)
        rejected = outcome['h0_rejs']
        # Denominators are floored at 1 to avoid division by zero.
        fpr = np.sum(rejected[np.logical_not(relevant)]) / max(
            n_select - n_relevant, 1)
        tpr = np.sum(rejected[relevant]) / max(n_relevant, 1)

    logging.debug("TPR is :{0:.3f} FPR is :{1:.3f}".format(tpr, fpr))
    return tpr, fpr
def sel_inf(self, X, Y, inf_type, alpha, niv, H0=None, M0=None, i=None,
            unbiased_parallel=False, n_jobs=20):
    """Post-selection inference with the Multi or Poly selection procedure.

    :param X, Y: covariate and response data
    :param inf_type: one-sided hypothesis testing or two-sided confidence
        interval calculation; only ``'test'`` is accepted (asserted)
    :param alpha: level 1-alpha, forwarded to the selection test
    :param niv: number of important variables, used for reporting results:
        entries from index ``niv`` onwards are zeroed in the "true"
        rejection indicator
    :return: ``sim.Inference_Result`` built from the selection and
        rejection indicators and the p-values

    H0, M0, i, unbiased_parallel and n_jobs are not used.
    """
    assert inf_type == 'test'
    n_feat = X.shape[1]

    # Initialising kernels
    kernel_x = kernel.KGauss(util.meddistance(X, subsample=1000)**2)
    if not self.discrete_output:
        bandwidth_y = util.meddistance(Y[:, np.newaxis], subsample=1000)**2
        kernel_y = kernel.KGauss(bandwidth_y)
    else:
        # Frequencies of the distinct response values; the same dictionary
        # is passed for both KDiscrete arguments.
        values, counts = np.unique(Y, return_counts=True)
        freq_dict = dict(zip(values, counts))
        kernel_y = KDiscrete(freq_dict, freq_dict)

    if self.estimator == 'inc':
        hsic_est = hsic.HSIC_Inc(kernel_x, kernel_y, ratio=self.l)
    else:
        # Every value other than 'inc' is treated as 'block'.
        hsic_est = hsic.HSIC_Block(kernel_x, kernel_y, bsize=self.B)

    # Poly selector when self.poly is set, Multi selector otherwise.
    selector = (PolySel if self.poly else MultiSel)(hsic_est)

    # Behaviour for evaluation of power w.r.t. first feature: the selection
    # test is only run if feature 0 is among the n_select largest entries
    # of hsic_est.compute. In regular behaviour it is always run.
    run_test = True
    if self.only_evaluate_first:
        scores = hsic_est.compute(X, Y[:, np.newaxis])
        top_idx = np.argpartition(scores, -self.n_select, axis=0)[-self.n_select:]
        run_test = 0 in top_idx

    if run_test:
        results = selector.test(X, Y[:, np.newaxis], args=self.n_select, alpha=alpha)
        sel_vars = results['sel_vars']
        h0_rejs = results['h0_rejs']
    else:
        # fake values: the last n_select features count as selected and
        # h0_rejs indexes the last of them
        sel_vars = np.arange(n_feat - self.n_select, n_feat)
        h0_rejs = np.array([self.n_select - 1])

    # Reporting
    selected = np.zeros(n_feat)
    selected[sel_vars] = 1

    rejected = np.zeros(n_feat)
    rejected[sel_vars[h0_rejs]] = 1

    # Same indicator as `selected`, with entries from `niv` onwards cleared.
    rejected_true = selected.copy()
    rejected_true[niv:] = 0

    # -1 marks entries without a p-value. p-values are not provided for
    # Poly and are not of interest for evaluation of empirical power.
    pvals_h = -np.ones(n_feat)
    if not (self.poly or self.only_evaluate_first):
        pvals_h[sel_vars] = results['pvals']

    inf_res = sim.Inference_Result(n_feat, None, selected, {'H': rejected},
                                   {'H': rejected_true}, {'H': pvals_h}, None)
    return inf_res