Exemplo n.º 1
0
def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius,
                        min_r2=0.2, maf_thres=0.01, indiv_filter=None,
                        snp_filter=None, return_void=True, verbose=True):
    """
    Calculate the LD table for one chromosome and cache it in an HDF5 file.

    If local_ld_hdf5_file does not exist yet, the genotypes of chromosome
    chrom_i are read from input_genotype_file, the LD table is computed with
    get_ld_table, the mean of its 'ld_scores' entry is added under the key
    'avg_ld_score', and the resulting dictionary is written to
    local_ld_hdf5_file.  If the file already exists it is loaded instead and
    nothing is recomputed.

    Arguments:
        input_genotype_file: path of the HDF5 genotype file.
        chrom_i: chromosome number (formatted with %d in the log message).
        local_ld_hdf5_file: path of the HDF5 cache file for the LD table.
        ld_radius, min_r2, verbose: forwarded to get_ld_table.
        maf_thres, indiv_filter, snp_filter: forwarded to
            kgenome.get_genotype_data.
        return_void: when True (default) nothing is returned.

    Returns:
        The LD dictionary if return_void is False, otherwise None.
    """
    if os.path.isfile(local_ld_hdf5_file):
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        # Context manager closes the cache file even if the loader raises.
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)
    else:
        # The genotype handle used to be left open; keep it open only while
        # the genotypes are read and the LD table is derived from them.
        with h5py.File(input_genotype_file) as h5f:
            print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))

            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter,
                                               snp_filter=snp_filter, randomize_sign=False,
                                               snps_signs=None)

            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2,
                                   verbose=verbose)

        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')

    if not return_void:
        return ld_dict
Exemplo n.º 2
0
def calc_pc_snp_weights(input_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
                        pc_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1kgenomes_kinship_pca_f0.95.hdf5',
                        out_file='/project/PCMA/faststorage/1_DATA/1k_genomes/pc_snp_weights_top20.hdf5',
                        snp_filter_frac=1, maf_thres=0.01, num_pcs=20):
    """
    Calculate per-SNP weights on the top principal components and store them.

    For each of the chromosomes 1-22 the normalized SNPs are loaded from
    input_file, the eigenvectors/eigenvalues stored under
    'evecs_leave_one_out' / 'evals_leave_one_out' in pc_file are sorted by
    decreasing eigenvalue, the first num_pcs eigenvectors are standardized
    (zero mean, unit standard deviation per component) and the SNP weights
    are computed as dot(norm_snps, norm_pcs) / number of individuals.

    Arguments:
        input_file: path of the HDF5 genotype file.
        pc_file: path of the HDF5 file holding the per-chromosome PCs.
        out_file: path of the HDF5 file to write (opened with mode 'w', so
            an existing file is overwritten).
        snp_filter_frac: if smaller than 1, a random boolean SNP filter is
            drawn where each SNP is kept with this probability.
        maf_thres: forwarded to kgenome.get_genotype_data.
        num_pcs: number of top principal components to use.

    Output file layout, one group 'chr<N>' per chromosome:
        'snp_pc_weights': the SNP weights described above.
        'pcs_var_expl': eigenvalues divided by their sum, in decreasing
            order, for all components (not only the top num_pcs).
        'snp_ids': the 'snp_ids' entry returned by the genotype loader.
    """
    print('Loading Genotype from %s' % input_file)

    # All three handles are released on exit, including on errors; pc_file
    # used to be left open.
    with h5py.File(pc_file) as pcs_h5f, \
            h5py.File(input_file) as in_h5f, \
            h5py.File(out_file, 'w') as out_h5f:
        indiv_ids = in_h5f['indiv_ids'][...]
        assert len(sp.unique(indiv_ids)) == len(indiv_ids)
        num_indivs = len(indiv_ids)

        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom

            snp_filter = None
            if snp_filter_frac < 1:
                snp_filter = sp.random.random(len(in_h5f[chrom_str]['snps'])) < snp_filter_frac

            # No individual filter is applied: every individual in the file is used.
            g_dict = kgenome.get_genotype_data(in_h5f, chrom, maf_thres, indiv_filter=None,
                                               snp_filter=snp_filter, randomize_sign=True,
                                               snps_signs=None, return_snps_info=True)

            norm_snps = g_dict['norm_snps']
            snp_ids = g_dict['snp_ids']

            evecs = pcs_h5f[chrom_str]['evecs_leave_one_out'][...]
            evals = pcs_h5f[chrom_str]['evals_leave_one_out'][...]

            # Order components by decreasing eigenvalue.
            sort_indices = sp.argsort(evals)[::-1]
            ordered_evals = evals[sort_indices]
            pcs_var_expl = sp.array(ordered_evals / sp.sum(ordered_evals), dtype='double')

            # Keep the top components and standardize each of them.
            pcs = evecs[:, sort_indices][:, :num_pcs]
            norm_pcs = pcs - sp.mean(pcs, axis=0)
            pcs_std = sp.std(norm_pcs, axis=0)
            norm_pcs = norm_pcs / pcs_std

            cg = out_h5f.create_group(chrom_str)
            cg.create_dataset('snp_pc_weights', data=sp.dot(norm_snps, norm_pcs) / num_indivs)
            cg.create_dataset('pcs_var_expl', data=pcs_var_expl)
            cg.create_dataset('snp_ids', data=snp_ids)
            out_h5f.flush()
            
Exemplo n.º 3
0
def calculate_ld_tables(input_genotype_file,
                        chrom_i,
                        local_ld_hdf5_file,
                        ld_radius,
                        min_r2=0.2,
                        maf_thres=0.01,
                        indiv_filter=None,
                        snp_filter=None,
                        return_void=True,
                        verbose=True):
    """
    Calculate the LD table for one chromosome, using an HDF5 file as cache.

    When local_ld_hdf5_file is missing, the genotypes of chromosome chrom_i
    are loaded from input_genotype_file, get_ld_table is run on the
    normalized SNPs, the mean of the resulting 'ld_scores' is stored under
    'avg_ld_score', and the dictionary is written to local_ld_hdf5_file.
    When the file is present its content is loaded and no computation is done.

    Arguments:
        input_genotype_file: path of the HDF5 genotype file.
        chrom_i: chromosome number (formatted with %d in the log message).
        local_ld_hdf5_file: path of the HDF5 cache file for the LD table.
        ld_radius, min_r2, verbose: forwarded to get_ld_table.
        maf_thres, indiv_filter, snp_filter: forwarded to
            kgenome.get_genotype_data.
        return_void: when True (default) nothing is returned.

    Returns:
        The LD dictionary if return_void is False, otherwise None.
    """
    if os.path.isfile(local_ld_hdf5_file):
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        # The with-block closes the cache file even when loading fails.
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)
    else:
        # Previously this handle was never closed; it is now released as
        # soon as the LD table has been derived from the genotypes.
        with h5py.File(input_genotype_file) as h5f:
            print('Calculating LD information for chromosome %d w. radius %d' % (
                chrom_i, ld_radius))

            g_dict = kgenome.get_genotype_data(h5f,
                                               chrom_i,
                                               maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=False,
                                               snps_signs=None)

            ld_dict = get_ld_table(g_dict['norm_snps'],
                                   ld_radius=ld_radius,
                                   min_r2=min_r2,
                                   verbose=verbose)

        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')

    if not return_void:
        return ld_dict
Exemplo n.º 4
0
def calc_pc_snp_weights(
    input_file="/project/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5",
    pc_file="/project/PCMA/faststorage/1_DATA/1k_genomes/1kgenomes_kinship_pca_f0.95.hdf5",
    out_file="/project/PCMA/faststorage/1_DATA/1k_genomes/pc_snp_weights_top20.hdf5",
    snp_filter_frac=1,
    maf_thres=0.01,
    num_pcs=20,
):
    """
    Compute SNP weights on the top principal components for chromosomes 1-22.

    Per chromosome, the eigenvectors and eigenvalues found in pc_file under
    "evecs_leave_one_out" and "evals_leave_one_out" are ordered by decreasing
    eigenvalue; the leading num_pcs eigenvectors are centered and scaled to
    unit standard deviation, and the weights are obtained as
    dot(norm_snps, norm_pcs) divided by the number of individuals.

    Arguments:
        input_file: path of the HDF5 genotype file.
        pc_file: path of the HDF5 file holding the per-chromosome PCs.
        out_file: path of the HDF5 output file (mode "w": any existing file
            is overwritten).
        snp_filter_frac: when below 1, SNPs are kept at random with this
            probability.
        maf_thres: forwarded to kgenome.get_genotype_data.
        num_pcs: number of top principal components to use.

    Each output group "chr<N>" receives three datasets:
        "snp_pc_weights": the weights described above.
        "pcs_var_expl": eigenvalues divided by their sum, sorted in
            decreasing order, covering all components (not just num_pcs).
        "snp_ids": the "snp_ids" entry returned by the genotype loader.
    """
    print("Loading Genotype from %s" % input_file)

    # Context managers guarantee the handles are closed, also on failure;
    # the PC file handle was previously never closed at all.
    with h5py.File(pc_file) as pcs_h5f, \
            h5py.File(input_file) as in_h5f, \
            h5py.File(out_file, "w") as out_h5f:
        indiv_ids = in_h5f["indiv_ids"][...]
        assert len(sp.unique(indiv_ids)) == len(indiv_ids)
        num_indivs = len(indiv_ids)

        for chrom in range(1, 23):
            print("Working on Chromosome %d" % chrom)
            chrom_str = "chr%d" % chrom

            snp_filter = None
            if snp_filter_frac < 1:
                snp_filter = sp.random.random(len(in_h5f[chrom_str]["snps"])) < snp_filter_frac

            # indiv_filter=None: all individuals of the input file are used.
            g_dict = kgenome.get_genotype_data(
                in_h5f,
                chrom,
                maf_thres,
                indiv_filter=None,
                snp_filter=snp_filter,
                randomize_sign=True,
                snps_signs=None,
                return_snps_info=True,
            )

            norm_snps = g_dict["norm_snps"]
            snp_ids = g_dict["snp_ids"]

            evecs = pcs_h5f[chrom_str]["evecs_leave_one_out"][...]
            evals = pcs_h5f[chrom_str]["evals_leave_one_out"][...]

            # Largest eigenvalue first.
            sort_indices = sp.argsort(evals)[::-1]
            ordered_evals = evals[sort_indices]
            pcs_var_expl = sp.array(ordered_evals / sp.sum(ordered_evals), dtype="double")

            # Select the leading components, then standardize column-wise.
            pcs = evecs[:, sort_indices][:, :num_pcs]
            norm_pcs = pcs - sp.mean(pcs, axis=0)
            pcs_std = sp.std(norm_pcs, axis=0)
            norm_pcs = norm_pcs / pcs_std

            cg = out_h5f.create_group(chrom_str)
            cg.create_dataset("snp_pc_weights", data=sp.dot(norm_snps, norm_pcs) / num_indivs)
            cg.create_dataset("pcs_var_expl", data=pcs_var_expl)
            cg.create_dataset("snp_ids", data=snp_ids)
            out_h5f.flush()
Exemplo n.º 5
0
def generate_1k_LD_scores(input_genotype_file,
                          chrom_snp_trans_mats,
                          maf_thres=0.01,
                          ld_radius=200,
                          debug_filter_frac=0.01,
                          indiv_filter=None,
                          snp_filter=None):
    """
    Calculate LD scores for chromosomes 1-22 and return them in a dictionary.

    For every chromosome the normalized SNPs are loaded and passed to
    ld.get_ld_scores.  If chrom_snp_trans_mats is given, the SNPs are also
    multiplied by the transpose of the chromosome's transformation matrix,
    re-normalized per SNP, and scored a second time ("structure adjusted").

    Arguments:
        input_genotype_file: path of the HDF5 genotype file.
        chrom_snp_trans_mats: mapping 'chr<N>' -> transformation matrix, or
            None to skip the structure adjustment.
        maf_thres, indiv_filter, snp_filter, debug_filter_frac: forwarded to
            kgenome.get_genotype_data.
        ld_radius: forwarded to ld.get_ld_scores.

    Returns:
        A dictionary with the keys
        'avg_gw_ld_score': genome-wide mean of the unadjusted LD scores,
        'avg_gw_struct_adj_ld_score': genome-wide mean of the structure
            adjusted LD scores (equal to 'avg_gw_ld_score' when
            chrom_snp_trans_mats is None, since no adjustment is applied),
        'chrom_dict': 'chr<N>' -> the genotype dictionary of that chromosome
            without 'norm_snps', 'snp_means' and 'snp_stds', extended with
            'ld_scores' and 'avg_ld_score' and, when adjusted,
            'struct_adj_ld_scores' and 'avg_struct_adj_ld_score'.
    """

    chrom_ld_scores_dict = {}
    ld_score_sum = 0
    struct_adj_ld_score_sum = 0
    num_snps = 0
    print('Calculating LD information w. radius %d' % ld_radius)
    # NOTE(review): this handle is never closed.  It is deliberately left
    # open because the returned per-chromosome dictionaries come straight
    # from kgenome.get_genotype_data and may still reference the file --
    # confirm they are in-memory arrays before adding a close().
    in_h5f = h5py.File(input_genotype_file)

    print('Calculating local LD')
    for chrom in range(1, 23):
        print('Working on Chromosome %d' % chrom)
        chrom_str = 'chr%d' % chrom

        print('Loading SNPs')
        g_dict = kgenome.get_genotype_data(in_h5f,
                                           chrom,
                                           maf_thres,
                                           indiv_filter=indiv_filter,
                                           snp_filter=snp_filter,
                                           randomize_sign=False,
                                           snps_signs=None,
                                           return_snps_info=True,
                                           debug_filter_frac=debug_filter_frac)

        norm_snps = g_dict['norm_snps']

        ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
        avg_ld_score = sp.mean(ret_dict['ld_scores'])
        g_dict['ld_scores'] = ret_dict['ld_scores']
        g_dict['avg_ld_score'] = avg_ld_score
        ld_score_sum += sp.sum(ret_dict['ld_scores'])

        print('Un-adjusted average LD score was: %0.3f' % avg_ld_score)

        if chrom_snp_trans_mats is not None:
            snp_trans_mat = chrom_snp_trans_mats[chrom_str]
            norm_snps = sp.dot(norm_snps, snp_trans_mat.T)

            # Re-normalize every SNP (row) after the transformation.
            snp_means = sp.mean(norm_snps, 1)
            snp_means.shape = (len(snp_means), 1)
            snp_stds = sp.std(norm_snps, 1)
            snp_stds.shape = (len(snp_stds), 1)
            norm_snps = sp.array((norm_snps - snp_means) / snp_stds)

            ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)

            avg_ld_score = sp.mean(ret_dict['ld_scores'])
            print('Pop-structure adjusted average LD score was: %0.3f' % avg_ld_score)

            g_dict['struct_adj_ld_scores'] = ret_dict['ld_scores']
            g_dict['avg_struct_adj_ld_score'] = avg_ld_score
            struct_adj_ld_score_sum += sp.sum(ret_dict['ld_scores'])

        # Drop the bulky genotype data before keeping the dictionary.
        del g_dict['norm_snps']
        del g_dict['snp_means']
        del g_dict['snp_stds']
        chrom_ld_scores_dict[chrom_str] = g_dict
        num_snps += len(norm_snps)

    avg_gw_ld_score = ld_score_sum / float(num_snps)
    if chrom_snp_trans_mats is not None:
        # Fix: this average used to be computed from ld_score_sum, so it
        # silently duplicated the unadjusted value.
        avg_gw_struct_adj_ld_score = struct_adj_ld_score_sum / float(num_snps)
    else:
        # Nothing was adjusted; report the unadjusted average as before.
        avg_gw_struct_adj_ld_score = avg_gw_ld_score
    ld_scores_dict = {
        'avg_gw_ld_score': avg_gw_ld_score,
        'avg_gw_struct_adj_ld_score': avg_gw_struct_adj_ld_score,
        'chrom_dict': chrom_ld_scores_dict
    }

    print('Done calculating the LD table and LD scores.')
    return ld_scores_dict
Exemplo n.º 6
0
def generate_1k_LD_scores(
    input_genotype_file,
    chrom_snp_trans_mats,
    maf_thres=0.01,
    ld_radius=200,
    debug_filter_frac=0.01,
    indiv_filter=None,
    snp_filter=None,
):
    """
    Compute per-chromosome and genome-wide LD scores and return them.

    Chromosomes 1-22 are processed one at a time: the normalized SNPs are
    loaded and scored with ld.get_ld_scores.  When chrom_snp_trans_mats is
    not None, the SNPs are additionally multiplied by the transpose of the
    chromosome's transformation matrix, re-normalized per SNP and scored
    again to obtain the structure adjusted LD scores.

    Arguments:
        input_genotype_file: path of the HDF5 genotype file.
        chrom_snp_trans_mats: mapping "chr<N>" -> transformation matrix, or
            None to skip the structure adjustment.
        maf_thres, indiv_filter, snp_filter, debug_filter_frac: forwarded to
            kgenome.get_genotype_data.
        ld_radius: forwarded to ld.get_ld_scores.

    Returns:
        A dictionary with the keys
        "avg_gw_ld_score": genome-wide mean of the unadjusted LD scores,
        "avg_gw_struct_adj_ld_score": genome-wide mean of the structure
            adjusted LD scores (the unadjusted mean when
            chrom_snp_trans_mats is None, as no adjustment takes place),
        "chrom_dict": "chr<N>" -> that chromosome's genotype dictionary with
            "norm_snps", "snp_means" and "snp_stds" removed and
            "ld_scores" / "avg_ld_score" added, plus "struct_adj_ld_scores"
            / "avg_struct_adj_ld_score" when the adjustment was performed.
    """

    chrom_ld_scores_dict = {}
    ld_score_sum = 0
    struct_adj_ld_score_sum = 0
    num_snps = 0
    print("Calculating LD information w. radius %d" % ld_radius)
    # NOTE(review): the file handle stays open on purpose.  The dictionaries
    # returned below originate from kgenome.get_genotype_data and might keep
    # references into the file -- verify they are plain arrays before
    # closing it here.
    in_h5f = h5py.File(input_genotype_file)

    print("Calculating local LD")
    for chrom in range(1, 23):
        print("Working on Chromosome %d" % chrom)
        chrom_str = "chr%d" % chrom

        print("Loading SNPs")
        g_dict = kgenome.get_genotype_data(
            in_h5f,
            chrom,
            maf_thres,
            indiv_filter=indiv_filter,
            snp_filter=snp_filter,
            randomize_sign=False,
            snps_signs=None,
            return_snps_info=True,
            debug_filter_frac=debug_filter_frac,
        )

        norm_snps = g_dict["norm_snps"]

        ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
        avg_ld_score = sp.mean(ret_dict["ld_scores"])
        g_dict["ld_scores"] = ret_dict["ld_scores"]
        g_dict["avg_ld_score"] = avg_ld_score
        ld_score_sum += sp.sum(ret_dict["ld_scores"])

        print("Un-adjusted average LD score was: %0.3f" % avg_ld_score)

        if chrom_snp_trans_mats is not None:
            snp_trans_mat = chrom_snp_trans_mats[chrom_str]
            norm_snps = sp.dot(norm_snps, snp_trans_mat.T)

            # The transformation changes mean and scale, so standardize
            # each SNP (row) again.
            snp_means = sp.mean(norm_snps, 1)
            snp_means.shape = (len(snp_means), 1)
            snp_stds = sp.std(norm_snps, 1)
            snp_stds.shape = (len(snp_stds), 1)
            norm_snps = sp.array((norm_snps - snp_means) / snp_stds)

            ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)

            avg_ld_score = sp.mean(ret_dict["ld_scores"])
            print("Pop-structure adjusted average LD score was: %0.3f" % avg_ld_score)

            g_dict["struct_adj_ld_scores"] = ret_dict["ld_scores"]
            g_dict["avg_struct_adj_ld_score"] = avg_ld_score
            struct_adj_ld_score_sum += sp.sum(ret_dict["ld_scores"])

        # Only the scores and SNP info are kept; the genotype matrix and
        # its normalization vectors are discarded.
        del g_dict["norm_snps"]
        del g_dict["snp_means"]
        del g_dict["snp_stds"]
        chrom_ld_scores_dict[chrom_str] = g_dict
        num_snps += len(norm_snps)

    avg_gw_ld_score = ld_score_sum / float(num_snps)
    if chrom_snp_trans_mats is not None:
        # Fix: the adjusted average was previously derived from
        # ld_score_sum and therefore always equalled the unadjusted one.
        avg_gw_struct_adj_ld_score = struct_adj_ld_score_sum / float(num_snps)
    else:
        # Without transformation matrices there is no adjustment; keep
        # returning the unadjusted average in that case.
        avg_gw_struct_adj_ld_score = avg_gw_ld_score
    ld_scores_dict = {
        "avg_gw_ld_score": avg_gw_ld_score,
        "avg_gw_struct_adj_ld_score": avg_gw_struct_adj_ld_score,
        "chrom_dict": chrom_ld_scores_dict,
    }

    print("Done calculating the LD table and LD scores.")
    return ld_scores_dict