Пример #1
0
def estimate_H(X, Y, estimator, B, ratio, discrete_output = None):
    """Estimate H feature by feature with a block or incomplete U-statistics HSIC estimator.

    :param X: covariates, indexed as ``X[:, i]`` (one column per feature)
    :param Y: response, must have the same length along axis 0 as ``X``
    :param estimator: ``'inc'`` selects ``hsic.HSIC_Inc``; any other value
        selects ``hsic.HSIC_Block``
    :param B: block size, only used by the block estimator
    :param ratio: size of the incomplete U-statistics estimator, only used
        when ``estimator == 'inc'``
    :param discrete_output: optional pair ``(freq_dict1, freq_dict2)`` handed
        to ``KDiscrete``; when ``None`` a Gaussian kernel is put on ``Y``
    :return: ``(H_estimates, H, m)`` where ``H_estimates`` has shape
        ``(p, m)``, ``H`` holds its row means and ``m`` is the number of
        individual estimates per feature
    """
    assert Y.shape[0] == X.shape[0]
    n_obs, n_feat = X.shape[0], X.shape[1]

    # Kernel on the covariates; bandwidth is the squared meddistance of X.
    kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)

    # Kernel on the response: Gaussian unless frequency dictionaries are given.
    if discrete_output is None:
        ky = kernel.KGauss(
            util.meddistance(Y[:, np.newaxis], subsample = 1000)**2)
    else:
        freq_dict1, freq_dict2 = discrete_output
        ky = KDiscrete(freq_dict1, freq_dict2)

    # Pick the estimator together with the number of estimates it yields.
    use_incomplete = estimator == 'inc'
    if use_incomplete:
        hsic_H = hsic.HSIC_Inc(kx, ky, ratio = ratio)
        m = int(n_obs * ratio)
    else:
        hsic_H = hsic.HSIC_Block(kx, ky, bsize = B)
        m = int(np.floor(n_obs / B))

    H_estimates = np.zeros((n_feat, m))
    for col in range(n_feat):
        per_feature = hsic_H.estimates(X[:, col, np.newaxis],
                                       Y[:, np.newaxis])
        # Flatten so the estimates fit into one row of the result matrix.
        H_estimates[col, :] = np.reshape(per_feature, -1)

    return H_estimates, np.mean(H_estimates, axis = 1), m
Пример #2
0
def one_trial(i, n_samples, problem, n_select, algo):
    """Run one selection experiment and return its (TPR, FPR).

    :param i: trial index; passed to ``problem.sample`` and used as the seed
        of the selection test
    :param n_samples: number of samples requested from ``problem``
    :param problem: object providing ``sample(n_samples, i)`` and
        ``is_true(sel_vars)``
    :param n_select: number of features to select (forwarded as ``args``)
    :param algo: callable that builds the selection procedure from a metric
    :return: tuple ``(tpr, fpr)``; both are 0 unless more than one variable
        was selected

    The module-level name ``mmd_or_hsic`` decides which metric is built.
    """
    p, r = problem.sample(n_samples, i)

    if 'MMD' in mmd_or_hsic:
        # One shared bandwidth computed on the stacked samples.
        pooled = np.vstack((p, r))
        shared_bw = util.meddistance(pooled, subsample=1000)**2
        metric = mmd.MMD_Inc(kernel.KGauss(shared_bw))
    else:
        # Separate bandwidth (and kernel) for each of the two samples.
        p_bw = util.meddistance(p, subsample=1000)**2
        r_bw = util.meddistance(r, subsample=1000)**2
        kernel_p = kernel.KGauss(p_bw)
        kernel_r = kernel.KGauss(r_bw)
        metric = hsic.HSIC_Inc(kernel_p, kernel_r, 5)

    selector = algo(metric)
    results = selector.test(p, r, args=n_select, seed=i)

    # Rates stay at zero unless more than one variable was selected.
    tpr, fpr = 0, 0
    if results['sel_vars'].shape[0] > 1:
        is_relevant = problem.is_true(results['sel_vars'])
        n_relevant = np.sum(is_relevant)
        rejected = results['h0_rejs']
        false_hits = np.sum(rejected[np.logical_not(is_relevant)])
        fpr = false_hits / max(n_select - n_relevant, 1)
        true_hits = np.sum(rejected[is_relevant])
        tpr = true_hits / max(n_relevant, 1)

    message = "TPR is :{0:.3f}  FPR is :{1:.3f}".format(tpr, fpr)
    logging.debug(message)
    return tpr, fpr
Пример #3
0
 def sel_inf(self, X, Y, inf_type, alpha, niv, H0 = None, M0 = None, i = None):
     """Run post-selection inference and package the outcome.

     :param X, Y: covariate and response data
     :param inf_type: must be 'test' (asserted)
     :param alpha: level forwarded to the selection test
     :param niv: number of important variables; indices >= niv are never
         counted as true rejections-eligible selections in the report
     :return: a ``sim.Inference_Result``
     H0, M0 and i are not used
     """
     assert inf_type == 'test'
     n_feat = X.shape[1]

     # Kernels: Gaussian on X; discrete or Gaussian on Y.
     kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
     if self.discrete_output:
         ky = KDiscrete()
     else:
         ky = kernel.KGauss(
             util.meddistance(Y[:, np.newaxis], subsample = 1000)**2)

     # HSIC estimator: incomplete U-statistic or block.
     if self.estimator == 'inc':
         hsic_H = hsic.HSIC_Inc(kx, ky, ratio = self.l)
     else:
         hsic_H = hsic.HSIC_Block(kx, ky, bsize = self.B)

     # Selection procedure: Poly or Multi.
     selector_cls = PolySel if self.poly else MultiSel
     feat_select = selector_cls(hsic_H)
     results = feat_select.test(X, Y[:, np.newaxis],
                                args = self.n_select, alpha = alpha)

     sel_vars = results['sel_vars']
     h0_rejs = results['h0_rejs']

     # Indicator of the selected variables.
     selected_mask = np.zeros(n_feat)
     selected_mask[sel_vars] = 1

     # Indicator of the selected variables whose null was rejected.
     rejected_mask = np.zeros(n_feat)
     rejected_mask[sel_vars[h0_rejs]] = 1

     # Selected variables restricted to the first niv indices.
     true_mask = selected_mask.copy()
     true_mask[niv:] = 0

     if self.poly:
         # p-values not provided for Poly
         p_values = None
     else:
         # -1 marks variables that were not selected.
         pvals_full = np.full(n_feat, -1.0)
         pvals_full[sel_vars] = results['pvals']
         p_values = {'H': pvals_full}

     return sim.Inference_Result(n_feat, None, selected_mask,
                                 {'H' : rejected_mask},
                                 {'H' : true_mask}, p_values, None)
Пример #4
0
def estimate_H_unbiased_parallel(X, Y, discrete_output = None):
    """Estimate H with ``hsic.HSIC_U``, one joblib task per column of ``X``.

    :param X: covariates, indexed as ``X[:, i]``
    :param Y: response with the same length along axis 0 as ``X``
    :param discrete_output: optional pair ``(freq_dict1, freq_dict2)`` for
        ``KDiscrete``; when ``None`` a Gaussian kernel is used on ``Y``
    :return: ``np.array`` of the per-column results, in column order
    """
    assert Y.shape[0] == X.shape[0]
    n_feat = X.shape[1]

    kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    if discrete_output is None:
        ky = kernel.KGauss(
            util.meddistance(Y[:, np.newaxis], subsample = 1000)**2)
    else:
        freq_dict1, freq_dict2 = discrete_output
        ky = KDiscrete(freq_dict1, freq_dict2)

    hsic_H = hsic.HSIC_U(kx, ky)

    def hsic_for_column(col):
        # HSIC between a single covariate column and the response.
        return hsic_H.compute(X[:, col, np.newaxis], Y[:, np.newaxis])

    # n_jobs = -1 is passed straight to joblib's Parallel.
    tasks = (delayed(hsic_for_column)(col) for col in range(n_feat))
    return np.array(Parallel(n_jobs = -1)(tasks))
Пример #5
0
def estimate_H_unbiased(X, Y, discrete_output = None):
    """Estimate H column by column with ``hsic.HSIC_U``.

    :param X: covariates, indexed as ``X[:, i]``
    :param Y: response with the same length along axis 0 as ``X``
    :param discrete_output: optional pair ``(freq_dict1, freq_dict2)`` for
        ``KDiscrete``; when ``None`` a Gaussian kernel is used on ``Y``
    :return: array of length ``p`` with one HSIC value per column of ``X``
    """
    assert Y.shape[0] == X.shape[0]
    n_feat = X.shape[1]

    # X-kernel: Gaussian, bandwidth from the squared meddistance of X.
    kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)

    # Y-kernel: Gaussian by default, discrete when frequencies are supplied.
    if discrete_output is None:
        ky = kernel.KGauss(
            util.meddistance(Y[:, np.newaxis], subsample = 1000)**2)
    else:
        freq_dict1, freq_dict2 = discrete_output
        ky = KDiscrete(freq_dict1, freq_dict2)

    hsic_H = hsic.HSIC_U(kx, ky)
    H = np.zeros(n_feat)
    for col in range(n_feat):
        feature = X[:, col, np.newaxis]
        H[col] = hsic_H.compute(feature, Y[:, np.newaxis])
    return H
Пример #6
0
def estimate_M_unbiased(X):
    """Estimate the symmetric matrix M with ``hsic.HSIC_U`` on pairs of columns.

    :param X: covariates, indexed as ``X[:, i]``
    :return: ``(M_true, M)`` where ``M_true`` is the raw ``(p, p)`` matrix of
        pairwise values and ``M`` is ``nearestPD(M_true)``
    """
    n_feat = X.shape[1]
    kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    hsic_M = hsic.HSIC_U(kx, kx)

    M_true = np.zeros((n_feat, n_feat))
    # Walk the lower triangle (diagonal included) row by row and mirror
    # each value, since the matrix is symmetric.
    rows, cols = np.tril_indices(n_feat)
    for a, b in zip(rows, cols):
        M_true[a, b] = hsic_M.compute(X[:, a, np.newaxis],
                                      X[:, b, np.newaxis])
        M_true[b, a] = M_true[a, b]

    # positive definite approximation
    return M_true, nearestPD(M_true)
Пример #7
0
def one_trial(i, n_samples, algsel, problem, n_select, hsic_e, params):
    """Run one HSIC-based selection experiment and return its (TPR, FPR).

    :param i: trial index; passed to ``problem.sample`` and used as test seed
    :param n_samples: number of samples requested from ``problem``
    :param algsel: callable building the selection procedure; invoked as
        ``algsel(estimator, params=True)``
    :param problem: object providing ``sample(n_samples, i)`` and
        ``is_true(sel_vars)``
    :param n_select: number of features to select (forwarded as ``args``)
    :param hsic_e: estimator factory called with the two kernels and
        ``params(n_samples)``
    :param params: callable mapping ``n_samples`` to the estimator's third
        argument
    :return: tuple ``(tpr, fpr)``; both are 0 unless more than one variable
        was selected
    """
    p, r = problem.sample(n_samples, i)

    # One Gaussian kernel per sample, each with its own bandwidth.
    p_bw = util.meddistance(p, subsample=1000)**2
    r_bw = util.meddistance(r, subsample=1000)**2
    kernel_p = kernel.KGauss(p_bw)
    kernel_r = kernel.KGauss(r_bw)
    estimator = hsic_e(kernel_p, kernel_r, params(n_samples))

    selector = algsel(estimator, params=True)
    results = selector.test(p, r, args=n_select, seed=i)

    # Rates stay at zero unless more than one variable was selected.
    tpr, fpr = 0, 0
    if results['sel_vars'].shape[0] > 1:
        is_relevant = problem.is_true(results['sel_vars'])
        n_relevant = np.sum(is_relevant)
        rejected = results['h0_rejs']
        false_hits = np.sum(rejected[np.logical_not(is_relevant)])
        fpr = false_hits / max(n_select - n_relevant, 1)
        true_hits = np.sum(rejected[is_relevant])
        tpr = true_hits / max(n_relevant, 1)

    message = "TPR is :{0:.3f}  FPR is :{1:.3f}".format(tpr, fpr)
    logging.debug(message)
    return tpr, fpr
Пример #8
0
def estimate_M(X, estimator, B, ratio):
    """Estimate the symmetric matrix M with a block or incomplete U-statistics estimator.

    :param X: covariates, indexed as ``X[:, i]``
    :param estimator: ``'inc'`` selects ``hsic.HSIC_Inc``; any other value
        selects ``hsic.HSIC_Block``
    :param B: block size, only used by the block estimator
    :param ratio: size of the incomplete U-statistics estimator, only used
        when ``estimator == 'inc'``
    :return: ``(M_true, M)`` where ``M_true`` holds the mean of the
        individual estimates for every pair of columns and ``M`` is
        ``nearestPD(M_true)``
    """
    n_feat = X.shape[1]
    kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)

    use_incomplete = estimator == 'inc'
    if use_incomplete:
        hsic_M = hsic.HSIC_Inc(kx, kx, ratio = ratio)
    else:
        hsic_M = hsic.HSIC_Block(kx, kx, bsize = B)

    M_true = np.zeros((n_feat, n_feat))
    for a in range(n_feat):
        col_a = X[:, a, np.newaxis]
        for b in range(a + 1):
            pair_estimates = hsic_M.estimates(col_a, X[:, b, np.newaxis])
            # Fill both halves at once; the matrix is symmetric.
            M_true[a, b] = M_true[b, a] = np.mean(pair_estimates)

    # positive definite approximation
    return M_true, nearestPD(M_true)
Пример #9
0
def estimate_M_unbiased_parallel(X):
    """Estimate the symmetric matrix M with ``hsic.HSIC_U``, one joblib task per column pair.

    :param X: covariates, indexed as ``X[:, i]``
    :return: ``(M_true, M)`` where ``M_true`` is the raw ``(p, p)`` matrix of
        pairwise values and ``M`` is ``nearestPD(M_true)``
    """
    n_feat = X.shape[1]
    kx = kernel.KGauss(util.meddistance(X, subsample = 1000)**2)
    hsic_M = hsic.HSIC_U(kx, kx)
    M_true = np.zeros((n_feat, n_feat))

    def pair_hsic(a, b):
        # HSIC between two single covariate columns.
        return hsic_M.compute(X[:, a, np.newaxis], X[:, b, np.newaxis])

    # Lower-triangular index pairs (diagonal included), row by row; the
    # results come back in the same order as the submitted tasks.
    pairs = [(a, b) for a in range(n_feat) for b in range(a + 1)]
    values = Parallel(n_jobs = -1)(delayed(pair_hsic)(a, b) for a, b in pairs)

    for (a, b), value in zip(pairs, values):
        M_true[a, b] = M_true[b, a] = value

    # positive definite approximation
    return M_true, nearestPD(M_true)
Пример #10
0
def one_trial(i, n_samples, algsel, problem, n_select, mmd_e):
    """Run one MMD-based selection experiment and return its (TPR, FPR).

    :param i: trial index; passed to ``problem.sample`` and used as test seed
    :param n_samples: number of samples requested from ``problem``
    :param algsel: callable building the selection procedure from the
        estimator
    :param problem: object providing ``sample(n_samples, i)`` and
        ``is_true(sel_vars)``
    :param n_select: number of features to select (forwarded as ``args``)
    :param mmd_e: estimator factory called with a single Gaussian kernel
    :return: tuple ``(tpr, fpr)``; both are 0 unless more than one variable
        was selected
    """
    p, r = problem.sample(n_samples, i)

    # One shared bandwidth computed on the stacked samples.
    pooled = np.vstack((p, r))
    shared_bw = util.meddistance(pooled, subsample=1000)**2
    estimator = mmd_e(kernel.KGauss(shared_bw))

    selector = algsel(estimator)
    results = selector.test(p, r, args=n_select, seed=i)

    # Rates stay at zero unless more than one variable was selected.
    tpr, fpr = 0, 0
    if results['sel_vars'].shape[0] > 1:
        is_relevant = problem.is_true(results['sel_vars'])
        n_relevant = np.sum(is_relevant)
        rejected = results['h0_rejs']
        false_hits = np.sum(rejected[np.logical_not(is_relevant)])
        fpr = false_hits / max(n_select - n_relevant, 1)
        true_hits = np.sum(rejected[is_relevant])
        tpr = true_hits / max(n_relevant, 1)

    message = "TPR is :{0:.3f}  FPR is :{1:.3f}".format(tpr, fpr)
    logging.debug(message)
    return tpr, fpr
Пример #11
0
    def sel_inf(self,
                X,
                Y,
                inf_type,
                alpha,
                niv,
                H0=None,
                M0=None,
                i=None,
                unbiased_parallel=False,
                n_jobs=20):
        """Run post-selection inference and package the outcome.

        :param X, Y: covariate and response data
        :param inf_type: must be 'test' (asserted)
        :param alpha: level forwarded to the selection test
        :param niv: number of important variables; selected indices >= niv
            are zeroed in the "true" indicator of the report
        :return: a ``sim.Inference_Result``
        H0, M0, i, unbiased_parallel and n_jobs are not used

        When ``self.only_evaluate_first`` is set, the selection test is only
        run if feature 0 is among the ``self.n_select`` largest entries of
        ``hsic_H.compute(X, Y)``; otherwise placeholder values are reported.
        """
        assert inf_type == 'test'
        n_feat = X.shape[1]

        # Kernels: Gaussian on X; discrete (from observed frequencies) or
        # Gaussian on Y.
        kx = kernel.KGauss(util.meddistance(X, subsample=1000)**2)
        if self.discrete_output:
            values, counts = np.unique(Y, return_counts=True)
            freq_dict = {value: count for value, count in zip(values, counts)}
            ky = KDiscrete(freq_dict, freq_dict)
        else:
            ky = kernel.KGauss(
                util.meddistance(Y[:, np.newaxis], subsample=1000)**2)

        # HSIC estimator: incomplete U-statistic or block.
        if self.estimator == 'inc':
            hsic_H = hsic.HSIC_Inc(kx, ky, ratio=self.l)
        else:
            hsic_H = hsic.HSIC_Block(kx, ky, bsize=self.B)

        # Selection procedure: Poly or Multi.
        selector_cls = PolySel if self.poly else MultiSel
        feat_select = selector_cls(hsic_H)

        # Regular behaviour is to always run the test. For the evaluation of
        # power w.r.t. the first feature, only continue if that feature is
        # among the n_select top-scoring ones.
        run_test = True
        if self.only_evaluate_first:
            scores = hsic_H.compute(X, Y[:, np.newaxis])
            top_scoring = np.argpartition(scores, -self.n_select,
                                          axis=0)[-self.n_select:]
            run_test = 0 in top_scoring

        if run_test:
            results = feat_select.test(X,
                                       Y[:, np.newaxis],
                                       args=self.n_select,
                                       alpha=alpha)
            sel_vars = results['sel_vars']
            h0_rejs = results['h0_rejs']
        else:
            # fake values
            sel_vars = np.arange(n_feat - self.n_select, n_feat)
            h0_rejs = np.array([self.n_select - 1])

        # Indicator of the selected variables.
        selected_mask = np.zeros(n_feat)
        selected_mask[sel_vars] = 1

        # Indicator of the selected variables whose null was rejected.
        rejected_mask = np.zeros(n_feat)
        rejected_mask[sel_vars[h0_rejs]] = 1

        # Selected variables restricted to the first niv indices.
        true_mask = selected_mask.copy()
        true_mask[niv:] = 0

        # -1 is the placeholder p-value.
        # p-values not provided for Poly
        # p-values not of interest for evaluation of empirical power
        pvals_full = np.full(n_feat, -1.0)
        if not (self.poly or self.only_evaluate_first):
            pvals_full[sel_vars] = results['pvals']

        inf_res = sim.Inference_Result(n_feat, None, selected_mask,
                                       {'H': rejected_mask},
                                       {'H': true_mask},
                                       {'H': pvals_full}, None)
        return inf_res