Exemplo n.º 1
0
def test_HMM_basic():
    """
    Smoke test for HMM knockoff generation.

    Samples data from a randomly generated HMM, builds knockoffs once without
    groups and once with every variable in its own group, and checks that the
    two results coincide and contain only finite values.
    """
    n_vars, n_states, n_symbols, n_samples = 50, 4, 5, 100

    # Data drawn from a randomly generated HMM
    pInit, Q, pEmit = generate_HMM(n_vars, n_states, n_symbols)
    X = models.HMM(pInit, Q, pEmit).sample(n_samples)

    # Knockoffs without any grouping
    Xk = knockoffHMM(pInit, Q, pEmit, seed=123).sample(X)

    # Knockoffs with one group per variable, using the same seed
    singleton_groups = np.arange(n_vars)
    grouped_sampler = knockoffHMM(pInit, Q, pEmit,
                                  groups=singleton_groups, seed=123)
    Xk_g = grouped_sampler.sample(X)

    same_output = np.array_equal(Xk, Xk_g)
    assert same_output, "Knockoffs with trivial groups do not match"
    assert np.isfinite(Xk).all(), "Knockoffs are not finite"
Exemplo n.º 2
0
def test_haplotypes_hmm():
    """
    Test whether the specialized haplotype knockoff algorithm agrees with the
    generic HMM knockoff algorithm.

    An HMM is fitted with fastPhase on phased data sampled from a random HMM.
    The fit is loaded twice: in its default form (used by knockoffHaplotypes
    through its "r", "alpha" and "theta" entries) and with compact=False (used
    by knockoffHMM through "pInit", "Q" and "pEmit"). Both samplers receive
    the same groups and seed and must return identical knockoffs.
    """
    import os  # local import: only needed to close the mkstemp descriptors

    p = 10
    K = 5
    M = 2
    n_train = 1000
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n_train)

    # mkstemp returns an already-open OS-level descriptor together with the
    # path. Only the path is used here, so close the descriptor immediately
    # instead of leaking it.
    fd, Xfp_file = tempfile.mkstemp()
    os.close(fd)
    fp.writeXtoInp(X, Xfp_file, phased=True)

    fastphase = "fastphase"  # Name of fastPhase executable
    fd, out_path = tempfile.mkstemp()
    os.close(fd)
    fp.runFastPhase(Xfp_file,
                    out_path,
                    fastphase=fastphase,
                    phased=True,
                    K=K,
                    numit=25)

    # Files read back below; all share the out_path prefix
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    char_file = out_path + "_origchars"

    # Consecutive variables share a group (groups of size up to 3)
    groups = np.repeat(np.arange(p), 3)[:p]

    hmm_compact = fp.loadHMM(r_file, alpha_file, theta_file, char_file)
    hmm = fp.loadHMM(r_file,
                     alpha_file,
                     theta_file,
                     char_file,
                     compact=False,
                     phased=True)

    knockoffs = knockoffHMM(hmm["pInit"],
                            hmm["Q"],
                            hmm["pEmit"],
                            groups=groups,
                            seed=123)
    knockoffs_hap = knockoffHaplotypes(hmm_compact["r"],
                                       hmm_compact["alpha"],
                                       hmm_compact["theta"],
                                       groups=groups,
                                       seed=123)

    Xk = knockoffs.sample(X)
    Xk_compact = knockoffs_hap.sample(X)
    assert np.array_equal(
        Xk, Xk_compact), "Haplotype knockoffs do not match generic HMM knockoffs"
Exemplo n.º 3
0
def test_HMM():
    """
    Check HMM knockoffs with verify_exchangeability.

    Data are sampled from a randomly generated HMM, knockoffs are produced
    with a fixed seed, and the pair is handed to verify_exchangeability.
    """
    n_vars, n_states, n_symbols = 10, 4, 5
    n_samples = 100000

    pInit, Q, pEmit = generate_HMM(n_vars, n_states, n_symbols)
    X = models.HMM(pInit, Q, pEmit).sample(n_samples)

    sampler = knockoffHMM(pInit, Q, pEmit, seed=123)
    Xk = sampler.sample(X)

    verify_exchangeability(X, Xk)
Exemplo n.º 4
0
 def _estimate(self, Xfp_file='./X.inp',
               path_to_fp="/home/roquero/Software/fastPHASE",
               out_path="./example"):
     """
     Fit an HMM to ``self.X`` with fastPhase and build the knockoff sampler.

     ``self.X`` is written to ``Xfp_file``, fastPhase is run on that file with
     ``K=self.hidden_states`` and ``numit=self.numit``, and the resulting
     parameter files are loaded. The fitted model is stored in ``self.hmm``
     and a sampler built from its "pInit", "Q" and "pEmit" entries is stored
     in ``self.knockoffHMM``.

     Parameters
     ----------
     Xfp_file : str
         Path of the fastPhase input file that is written from ``self.X``.
     path_to_fp : str
         Path to the fastPhase executable. The default is the location that
         used to be hard-coded here (an absolute, machine-specific path);
         pass your own path on any other machine.
     out_path : str
         Prefix of the output files produced by fastPhase
         (``<out_path>_rhat.txt``, ``<out_path>_alphahat.txt``,
         ``<out_path>_thetahat.txt``).
     """
     fp.writeX(self.X, Xfp_file)
     fp.runFastPhase(path_to_fp,
                     Xfp_file,
                     out_path,
                     K=self.hidden_states,
                     numit=self.numit)

     # Parameter files written by fastPhase under the chosen prefix
     r_file = out_path + "_rhat.txt"
     alpha_file = out_path + "_alphahat.txt"
     theta_file = out_path + "_thetahat.txt"

     # The first row of X is passed along to the loader, as in the original
     # call. NOTE(review): its role inside fp.loadFit is not visible here.
     self.hmm = fp.loadFit(r_file, theta_file, alpha_file, self.X[0, :])
     self.knockoffHMM = knockoffHMM(self.hmm["pInit"], self.hmm["Q"],
                                    self.hmm["pEmit"])
Exemplo n.º 5
0
def test_HMM_groups():
    """
    Check grouped HMM knockoffs with verify_exchangeability.

    Same setup as the ungrouped check, except that consecutive variables are
    assigned to shared groups and the groups are passed both to the knockoff
    sampler and to verify_exchangeability.
    """
    n_vars, n_states, n_symbols = 10, 4, 5
    n_samples = 100000

    pInit, Q, pEmit = generate_HMM(n_vars, n_states, n_symbols)
    X = models.HMM(pInit, Q, pEmit).sample(n_samples)

    # Each group label is repeated three times, truncated to n_vars entries
    groups = np.repeat(np.arange(n_vars), 3)[:n_vars]

    sampler = knockoffHMM(pInit, Q, pEmit, groups=groups, seed=123)
    Xk = sampler.sample(X)

    verify_exchangeability(X, Xk, groups=groups)
Exemplo n.º 6
0
# Randomly generate HMM parameters, sample data from the model, generate
# knockoffs and compare them with the original variables.
# NOTE(review): p, K, M and Q are used below but are defined before this
# fragment; their definitions are not visible here.
pEmit = np.zeros((p,M,K))
# One uniform draw in [0, 10) per variable; added to the diagonals below.
gamma = np.random.uniform(low=0, high=10, size=p)
# Transition matrices: random entries plus gamma[j] on the diagonal, then
# every row is rescaled so that it sums to 1.
for j in range(p-1):    
    Q[j,:,:] = np.resize(np.random.uniform(size=K*K),(K,K))
    Q[j,:,:] += np.diag([gamma[j]]*K) 
    Q[j,:,:] /= np.sum(Q[j,:,:],1)[:,None]
# Emission matrices: random entries plus gamma[j] on the diagonal, then every
# column is rescaled so that it sums to 1.
# NOTE(review): the diagonal term is K x K while pEmit[j] is M x K, so this
# addition appears to require M == K -- confirm against the definitions of
# M and K.
for j in range(p):
    pEmit[j,:,:] = np.resize(np.random.uniform(size=M*K),(M,K))    
    pEmit[j,:,:] += np.diag([gamma[j]]*K) 
    pEmit[j,:,:] /= np.sum(pEmit[j,:,:],0)
# Length-K vector with a single 1 in position 0 (presumably the initial
# hidden-state distribution, i.e. the chain always starts in state 0).
pInit = np.zeros((K,))
pInit[0] = 1

# Sample X
n=10000
modelX = models.HMM(pInit, Q, pEmit)
X = modelX.sample(n)

# Generate the knockoffs
knockoffs = knockoffHMM(pInit, Q, pEmit)
Xk = knockoffs.sample(X)

# Plot paths
util.plotPaths(X,Xk)

# Compare original variables and knockoffs
# (the last call repeats the cross-correlation comparison with dist=0)
util.compare_marginals(X,Xk)
util.compare_cons_corr(X,Xk)
util.compare_cross_corr(X,Xk)
util.compare_cross_corr(X,Xk,dist=0)
Exemplo n.º 7
0
def make_knockoff(chromosome=None, grouped_by_chromosome=None, df_SNP=None,
                  df_geno_experiment=None, df_geno_ensembl=None,
                  SNP_to_wild_type=None, cache_dir=None, path_to_fp=None,
                  em_iterations=25, random_seed=123, num_hidden_states=12):
    """
    Generate HMM knockoffs for the SNPs of one chromosome.

    The HMM is fitted with fastPhase on the ensembl genotypes (or taken from
    ``cache_dir`` if a previous fit for this chromosome is present) and then
    used to sample knockoffs of the experiment genotypes.

    Parameters
    ----------
    chromosome : int
        Chromosome to process; used as key into ``grouped_by_chromosome.groups``
        and in the cache file names.
    grouped_by_chromosome : object with a ``groups`` mapping
        ``groups[chromosome]`` gives the positional row indices of ``df_SNP``
        belonging to the chromosome (presumably a pandas GroupBy -- confirm).
    df_SNP : pandas.DataFrame
        Must have the columns 'chromosome_position' and 'SNP'.
    df_geno_experiment, df_geno_ensembl : pandas.DataFrame
        Genotype tables with one column per SNP name.
    SNP_to_wild_type : mapping
        Maps a SNP name to the wild type passed to
        ``utils.genotype_to_nonwild_type_count``.
    cache_dir : str
        Directory holding the fastPhase input and output files.
    path_to_fp : str
        Path to the fastPhase executable.
    em_iterations : int
        Passed to fastPhase as ``numit``.
    random_seed : int
        Seed passed to ``knockoffHMM``.
    num_hidden_states : int
        Passed to fastPhase as ``K`` (previously hard-coded to 12).

    Returns
    -------
    tuple
        ``(X_knockoffs, X_experiment, SNPs_on_chromosome)``.

    Notes
    -----
    The cache is keyed by chromosome only: when all cached files exist,
    ``em_iterations`` and ``num_hidden_states`` are not used.
    """
    # Kept as asserts so that callers continue to see AssertionError.
    assert chromosome is not None
    assert grouped_by_chromosome is not None
    assert df_SNP is not None

    logger.debug("################")
    logger.debug("Chromosome %2d #", chromosome)
    logger.debug("################")

    num_experiment_people = len(df_geno_experiment)
    num_ensembl_people = len(df_geno_ensembl)

    # SNPs of this chromosome, ordered by position
    indices = grouped_by_chromosome.groups[chromosome]
    df_SNP_chromo = df_SNP.iloc[indices].sort_values('chromosome_position')
    SNPs_on_chromosome = df_SNP_chromo['SNP'].values

    # Encode both genotype tables column by column, one column per SNP
    X_experiment = np.empty((num_experiment_people, len(SNPs_on_chromosome)))
    X_ensembl = np.empty((num_ensembl_people, len(SNPs_on_chromosome)))
    for X, df in [
            (X_experiment, df_geno_experiment),
            (X_ensembl, df_geno_ensembl)]:
        for j, SNP in enumerate(SNPs_on_chromosome):
            X[:, j] = utils.genotype_to_nonwild_type_count(
                df[SNP].values, SNP_to_wild_type[SNP])

    # Single prefix from which every fastPhase output path is derived, so that
    # the cache check looks at exactly the files that are written and read.
    out_path = os.path.join(cache_dir, 'chrom_%d' % chromosome)

    # If all relevant files are found in cache, skip EM recomputation;
    # otherwise, redo the whole thing.
    target_file_suffix_list = [
        'alphahat.txt', 'finallikelihoods', 'origchars', 'rhat.txt',
        'thetahat.txt']
    already_in_cache = all(
        os.path.exists('%s_%s' % (out_path, suffix))
        for suffix in target_file_suffix_list)

    if already_in_cache:
        logger.debug("Found chrom %d HMM in cache", chromosome)
    else:
        # Write array to file
        Xfp_file = os.path.join(cache_dir, 'X_%d.inp' % chromosome)
        fp.writeX(X_ensembl, Xfp_file)

        # Run fastPhase on data (which runs EM)
        fp.runFastPhase(path_to_fp, Xfp_file, out_path,
                        K=num_hidden_states, numit=em_iterations)

    # Read in fastPhase results (i.e., HMM parameters) from file:
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    # NOTE(review): open question carried over from the original code --
    # why is X_ensembl[0, :] passed to the loader?
    hmm = fp.loadFit(r_file, theta_file, alpha_file, X_ensembl[0, :])

    # Actually produce the knockoffs
    knockoffs = knockoffHMM(hmm["pInit"], hmm["Q"], hmm["pEmit"],
                            seed=random_seed)
    X_knockoffs = knockoffs.sample(X_experiment)

    return X_knockoffs, X_experiment, SNPs_on_chromosome