Example #1
def export_ldscore(ht, pop):
    hm3_snps = hl.read_table(get_hm3_snplist_path(pop))

    ht = ht.select(CHR=ht.locus.contig,
                   SNP=hl.variant_str(ht.locus, ht.alleles),
                   RSID=ht.rsid,
                   BP=ht.locus.position,
                   L2=ht.ld_score,
                   MAF=0.5 - hl.abs(0.5 - ht.AF))
    # Count all variants (M) and variants with MAF > 5% (M_5_50)
    count = ht.aggregate(
        hl.struct(M=hl.agg.count(), M_5_50=hl.agg.count_where(ht.MAF > 0.05)))
    # Restrict the exported LD scores to HapMap3 SNPs
    ht = ht.filter(hl.is_defined(hm3_snps[ht.locus, ht.alleles]))
    ht = ht.key_by().drop('locus', 'alleles', 'MAF')

    # Write the M / M_5_50 variant counts to flat files
    # (hadoop_open is hail.utils.hadoop_open)
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M'),
                     'w') as f:
        f.write(f'{count.M}\n')
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50'),
                     'w') as f:
        f.write(f'{count.M_5_50}\n')

    # LD score with variant ids
    ht.drop('RSID').export(get_ld_score_flat_file_path(pop))
    # with rsids
    ht.transmute(SNP=ht.RSID).export(
        get_ld_score_flat_file_path(pop, rsid=True))
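A usage sketch for export_ldscore, assuming a hypothetical get_ld_score_ht_path helper (mirroring the path helpers above) that returns a table keyed by locus/alleles with ld_score, AF, and rsid fields; the population list is illustrative only:

import hail as hl

# Hypothetical driver loop; the populations and get_ld_score_ht_path are assumptions.
for pop in ['AFR', 'EAS', 'EUR']:
    ht = hl.read_table(get_ld_score_ht_path(pop))
    export_ldscore(ht, pop)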
Example #2
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Checkpoint: write the imported VCF, then read it back for faster re-use
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])

    # Estimate PCs on a 1% downsample of variants, then join the scores
    # back onto all samples
    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
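The write-then-read pair in run_gwas is a common Hail checkpoint idiom; newer Hail versions expose it directly as MatrixTable.checkpoint. A minimal sketch of the equivalent call (the VCF path is illustrative):

import hail as hl

# Equivalent to .write('tmp.mt') followed by hl.read_matrix_table('tmp.mt')
mt = hl.import_vcf('data.vcf.bgz', force_bgz=True).checkpoint('tmp.mt', overwrite=True)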
Example #3
def run_pca(prune_out: str,
            pca_prefix: str,
            overwrite: bool = False):
    """
    Run PCA on a dataset
    :param mt: dataset to run PCA on
    :param pca_prefix: directory and filename prefix for where to put PCA output
    :return:
    """

    mt = hl.read_matrix_table(prune_out)

    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(pca_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(pca_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1]
           for i in range(1, 21)})
    pca_scores.export(pca_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.export(pca_prefix + 'loadings.txt.bgz')

    pca_loadings.write(pca_prefix + 'loadings.ht', overwrite)  # PCA loadings

    # Export loadings in PLINK format
    ht = hl.read_table(pca_prefix + 'loadings.ht')
    ht = ht.key_by()
    ht_loadings = ht.select(
        ID=hl.variant_str(ht.locus, ht.alleles),
        ALT=ht.alleles[1],
        **{f"PC{i}": ht.loadings[i - 1]
           for i in range(1, 21)})
    ht_afreq = ht.select(
        **{
            "#ID": hl.variant_str(ht.locus, ht.alleles),
            "REF": ht.alleles[0],
            "ALT": ht.alleles[1],
            "ALT1_FREQ": ht.pca_af
        })
    ht_loadings.export(pca_prefix + 'loadings.plink.tsv')
    ht_afreq.export(pca_prefix + 'loadings.plink.afreq')
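A usage sketch; the paths are illustrative:

# Hypothetical call: run PCA on a previously LD-pruned MatrixTable.
run_pca('gs://my-bucket/pruned.mt', 'gs://my-bucket/pca/', overwrite=True)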
Example #4
File: run_gwas.py  Project: saponas/hail
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])
    mt = hl.sample_qc(mt)
    # Keep samples with mean depth >= 4 and call rate >= 0.97
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4)
                        & (mt.sample_qc.call_rate >= 0.97))
    # Filter genotype calls on allele balance (alt reads / total reads)
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1))
                           | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75))
                           | (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)
    mt = hl.variant_qc(mt)
    # Keep common variants (alt allele frequency > 1%)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1.0, mt.pheno.isFemale, mt.scores[0],
                                         mt.scores[1], mt.scores[2]
                                     ])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
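The allele-balance bands used above (hom-ref at most 0.1, het within [0.25, 0.75], hom-var at least 0.9) can be sanity-checked on literal values; a small sketch:

import hail as hl

# A het call with AD = [12, 10] has allele balance 10/22 ~ 0.455,
# inside the het band [0.25, 0.75], so the entry would be kept.
ad = hl.literal([12, 10])
print(hl.eval(ad[1] / hl.sum(ad)))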
Example #5
BROWSER_GENE_RESULTS_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02_browser/data/ht/browser_gene_results_table.ht'

ht = mt.rows().key_by('locus', 'alleles')
# Throw away everything except what we need for variant annotations

ht = ht.select(transcript_csq=ht.annotation.vep.transcript_consequences,
               worst_csq_for_variant_canonical=ht.annotation.vep.worst_csq_for_variant_canonical,
               csq_analysis=ht.annotation.consequence_category,
               csq_worst=ht.annotation.vep.most_severe_consequence,
               cadd=ht.annotation.cadd.PHRED_score,
               mpc=ht.annotation.mpc.MPC)

# Explode on consequence category
ht = ht.explode('transcript_csq', name='transcript_csq')

ht = ht.select(variant_id=hl.variant_str(ht.locus, ht.alleles),
               gene_id=ht.worst_csq_for_variant_canonical.gene_id,
               gene_name=ht.worst_csq_for_variant_canonical.gene_symbol,
               canonical_transcript_id=ht.worst_csq_for_variant_canonical.transcript_id,
               transcript_id=ht.transcript_csq.transcript_id,
               hgvsc_canonical=ht.worst_csq_for_variant_canonical.hgvsc,
               hgvsc=ht.transcript_csq.hgvsc,
               hgvsp_canonical=ht.worst_csq_for_variant_canonical.hgvsp,
               hgvsp=ht.transcript_csq.hgvsp,
               csq_analysis=ht.csq_analysis,
               csq_worst=ht.csq_worst,
               csq_canonical=ht.worst_csq_for_variant_canonical.most_severe_consequence,
               cadd=ht.cadd,
               mpc=ht.mpc,
               polyphen=ht.worst_csq_for_variant_canonical.polyphen_prediction)
# BROWSER_VARIANT_ANNOTATION_TABLE is a path constant defined elsewhere in the original file
ht.write(BROWSER_VARIANT_ANNOTATION_TABLE, overwrite=True)
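A read-back sketch for the table written above (again assuming BROWSER_VARIANT_ANNOTATION_TABLE is defined elsewhere in the original file):

ht = hl.read_table(BROWSER_VARIANT_ANNOTATION_TABLE)
ht.describe()  # inspect the schema
ht.show(5)     # preview a few rows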
Example #6
                       contig_recoding=recoding_dict)
    ### Filter to PASS variants and split multiallelic sites
    mt2 = mt.filter_rows(mt.filters.size() > 0, keep=False)
    mt2 = hl.split_multi_hts(mt2)

    # Keep entries with >= 2 alt-supporting reads and at least one read in
    # each of the forward (F1R2) and reverse (F2R1) orientations;
    # then remove monomorphic variants
    mt3 = mt2.filter_entries(
        ((mt2.AD[1] < 2) | (mt2.F1R2[1] == 0) | (mt2.F2R1[1] == 0)),
        keep=False)
    mt3 = hl.variant_qc(mt3)
    mt3 = mt3.filter_rows(
        (mt3.variant_qc.AF[1] > 0) & (mt3.variant_qc.AF[1] < 1), keep=True)

    mt4 = mt3.annotate_rows(
        v=hl.variant_str(mt3.locus, mt3.alleles),
        NumAltAlleles=hl.agg.max(mt3.GT.n_alt_alleles()),
        VAF=hl.agg.explode(lambda x: hl.agg.mean(x), mt3.AF),
        TLOD=mt3.info.TLOD[0],
        GERMQ=mt3.info.GERMQ,
        STR=mt3.info.STR,
        AD_alt=hl.agg.mean(mt3.AD[1]),
        AD_ref=hl.agg.mean(mt3.AD[0]))

    mt4 = mt4.annotate_entries(
        Binomial_Prob=hl.binom_test(mt4.AD[1], mt4.DP, 0.5, 'greater'))
    mt4 = mt4.key_rows_by("v")
    mt4 = mt4.drop('locus', 'alleles', 'qual', 'filters', 'variant_qc', 'GQ',
                   'PGT', 'PID', 'PL', 'PS', 'info', 'rsid', 'a_index',
                   'was_split')
    filt2 = mt4.count_rows()
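hl.binom_test with alternative='greater' gives the one-sided probability of seeing at least AD[1] alt reads out of DP trials under p = 0.5; a quick check on literal values:

import hail as hl

# P(X >= 9) for X ~ Binomial(10, 0.5) is 11/1024, about 0.0107
print(hl.eval(hl.binom_test(9, 10, 0.5, 'greater')))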
Example #7
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size):

    #Load id_conversion file
    #table_idconv=hl.import_table('id_conversion')

    #Load markers files
    #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True)
    #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True)

    #cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table
    #awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2

    hl.init()
    cond1.acquire()
    while not an_item_is_available(q):
        #print("Thread hail to sleep")
        #time.sleep(300)
        print("Thread hail to wait")

        cond1.wait()

    file = get_an_available_item(q)
    print("Thread hail get item " + file)
    qaws_size = qaws_size - 1  # note: rebinds the local int only; the caller's value is unchanged
    cond1.release()

    interval_table = hl.import_locus_intervals('interval_table2',
                                               reference_genome='GRCh38')

    while file != "END":

        fileParts = file.split("/")[-1]
        fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "")
        chrName = fileName.split("_")[-3]
        #myFNAL=fileName.split("\\.")
        #myTempId=myFNAL[0]
        #Load gVCF file
        #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38')
        #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38')
        try:
            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38')
            #data=hl.import_vcf(file.replace("s3://","s3a://"),force_bgz=True,reference_genome='GRCh38')
            #Filters PASS
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)
            #Multiallelic
            data = hl.split_multi_hts(data)
            #Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))
            #Replace with 0s and 1s
            #Export
            #data_filtered_annot=data_filtered.annotate_entries(output=(data_filtered.GT.is_het()|data_filtered.GT.is_hom_var()))
            # Change the row key so it contains the alleles, not just the position
            variant_key = data_filtered.key_rows_by(variant=hl.variant_str(
                data_filtered.locus, data_filtered.alleles))
            # Export the field created above along with the ALT allele count
            #variant_key.GT.n_alt_alleles().export(outputDir+"/"+fileName+".tsv")
            # Export GT itself rather than only the alt-allele count, since sex
            # chromosomes require checking the full genotype
            variant_key.GT.export(outputDir + "/" + fileName + ".tsv")

            #Extract INFO fields

            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 drop_samples=True)
            #Filters PASS
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)
            #Multiallelic
            data = hl.split_multi_hts(data)
            #Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))

            if chrName != "chrY":
                data_sr = data_filtered.select_rows(
                    data_filtered.info.medianDepthAll,
                    data_filtered.info.medianDepthNonMiss,
                    data_filtered.info.medianGQ,
                    data_filtered.info.missingness,
                    data_filtered.info.completeGTRatio,
                    data_filtered.info.ABratio, data_filtered.info.MendelSite,
                    data_filtered.info.AN, data_filtered.info.AC,
                    data_filtered.info.AC_Hom, data_filtered.info.AC_Het)

            else:
                data_sr = data_filtered.select_rows(data_filtered.info.AN,
                                                    data_filtered.info.AC,
                                                    data_filtered.info.AC_Hom,
                                                    data_filtered.info.AC_Het)

            ht = data_sr.make_table()
            ht.export(outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/\[//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("sed -i 's/]//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("cat " + outputDir + "/" + fileName +
                      "_INFO.tsv | grep -v locus " + " >> " + outputDir +
                      "/INFO_" + chrName)

            cond2.acquire()
            print("Thread hail make item available " + fileName)
            make_an_item_available(qcm, file)
            cond2.notify_all()
            cond2.release()
        except FatalError as e:
            print("Exception2 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except AssertionError as e:
            print("Exception3 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except Exception as e:
            print("Exception in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

            #raise Exception
        cond1.acquire()
        while not an_item_is_available(q):
            #print("Thread hail to sleep")
            #time.sleep(300)
            print("Thread hail to wait")
            cond1.wait()

        file = get_an_available_item(q)
        print("Thread hail get item " + file)
        qaws_size = qaws_size - 1
        cond1.release()
    time.sleep(300)
    cond2.acquire()
    print("Thread hail make END available")
    make_an_item_available(qcm, "END")
    cond2.notify_all()
    cond2.release()
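The queue helpers an_item_is_available, get_an_available_item, and make_an_item_available are not shown in this snippet; a minimal sketch consistent with how they are called (q and qcm as plain lists, with all access guarded by the cond1/cond2 locks) might look like:

def an_item_is_available(q):
    # Caller must hold the associated condition's lock.
    return len(q) > 0

def get_an_available_item(q):
    # Pop the oldest item (FIFO); caller must hold the lock.
    return q.pop(0)

def make_an_item_available(q, item):
    # Append an item; the caller notifies waiters afterwards.
    q.append(item)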
Example #8
]),
                                            delimiter=':'))

# prep to merge with GWAS variant list
ht_mfi = ht_mfi.key_by('variant')
ht_mfi = ht_mfi.annotate(maf=hl.float(ht_mfi.maf), info=hl.float(ht_mfi.info))
ht_mfi = ht_mfi.select('varid', 'rsid', 'maf', 'info')

#######
# load GWAS variant list
#######

# get GWAS variant list
ht_sites = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_variants.ht')
ht_sites = ht_sites.annotate(
    variant=hl.variant_str(ht_sites.locus, ht_sites.alleles))
ht_sites = ht_sites.key_by('variant')

########
# merge and save
########

# get final merged file with maf/info of the gwas variants
ht = ht_mfi.join(ht_sites, how='inner')
ht = ht.select('locus', 'alleles', 'varid', 'rsid', 'maf', 'info')
print(ht.count())

# save both ht and tsv
ht.write('gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.ht',
         overwrite=True)
ht.export('gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.tsv.bgz')
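A sanity-check sketch: read the exported flat file back and confirm its row count matches the ht.count() printed above:

ht_check = hl.import_table(
    'gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.tsv.bgz',
    impute=True)
print(ht_check.count())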