def export_ldscore(ht, pop):
    hm3_snps = hl.read_table(get_hm3_snplist_path(pop))

    ht = ht.select(CHR=ht.locus.contig,
                   SNP=hl.variant_str(ht.locus, ht.alleles),
                   RSID=ht.rsid,
                   BP=ht.locus.position,
                   L2=ht.ld_score,
                   MAF=0.5 - hl.abs(0.5 - ht.AF))
    count = ht.aggregate(
        hl.struct(M=hl.agg.count(), M_5_50=hl.agg.sum(ht.MAF > 0.05)))
    ht = ht.filter(hl.is_defined(hm3_snps[ht.locus, ht.alleles]))
    ht = ht.key_by().drop('locus', 'alleles', 'MAF')

    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M'), 'w') as f:
        f.write(f'{count.M}\n')
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50'), 'w') as f:
        f.write(f'{count.M_5_50}\n')

    # LD score with variant ids
    ht.drop('RSID').export(get_ld_score_flat_file_path(pop))
    # with rsids
    ht.transmute(SNP=ht.RSID).export(
        get_ld_score_flat_file_path(pop, rsid=True))
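# Sketch of the context export_ldscore assumes. The helper path functions are
# project-specific and shown here only as hypothetical stubs; hadoop_open is
# assumed to come from hail.utils (also exposed as hl.hadoop_open).
import hail as hl
from hail.utils import hadoop_open


def get_hm3_snplist_path(pop):
    # hypothetical layout of the per-population HapMap3 SNP list
    return f'gs://my-bucket/hm3/{pop}.snplist.ht'


def get_ld_score_flat_file_path(pop, extension='ldscore.gz', rsid=False):
    # hypothetical layout of the flat-file outputs written above
    suffix = f'rsid.{extension}' if rsid else extension
    return f'gs://my-bucket/ldscore/{pop}.{suffix}'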
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    # PCA on a 1% downsample of the rows to get ancestry covariates
    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)
    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])
    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
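# A minimal driver for run_gwas above, following the Hail GWAS tutorial data
# layout; the file paths are placeholders, and the phenotype table is assumed
# to contain 'Sample' and 'CaffeineConsumption' columns.
import hail as hl

hl.init()
run_gwas('data/1kg.vcf.bgz', 'data/1kg_annotations.txt', 'output/gwas')
# writes output/gwas.assoc (SNP, P) plus PLINK .bed/.bim/.fam files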
def run_pca(prune_out: str, pca_prefix: str, overwrite: bool = False):
    """
    Run PCA on a dataset

    :param prune_out: path to the LD-pruned MatrixTable to run PCA on
    :param pca_prefix: directory and filename prefix for where to put PCA output
    :param overwrite: whether to overwrite existing output
    """
    mt = hl.read_matrix_table(prune_out)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    # individual-level PCs
    pca_scores.write(pca_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(pca_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, 21)})
    pca_scores.export(pca_prefix + 'scores.txt.bgz')

    # PCA loadings
    pca_loadings.export(pca_prefix + 'loadings.txt.bgz')
    pca_loadings.write(pca_prefix + 'loadings.ht', overwrite)

    # Export loadings in PLINK format
    ht = hl.read_table(pca_prefix + 'loadings.ht')
    ht = ht.key_by()
    ht_loadings = ht.select(
        ID=hl.variant_str(ht.locus, ht.alleles),
        ALT=ht.alleles[1],
        **{f'PC{i}': ht.loadings[i - 1] for i in range(1, 21)})
    ht_afreq = ht.select(
        **{
            '#ID': hl.variant_str(ht.locus, ht.alleles),
            'REF': ht.alleles[0],
            'ALT': ht.alleles[1],
            'ALT1_FREQ': ht.pca_af
        })
    ht_loadings.export(pca_prefix + 'loadings.plink.tsv')
    ht_afreq.export(pca_prefix + 'loadings.plink.afreq')
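# Example invocation of run_pca above; the MatrixTable path and output prefix
# are hypothetical placeholders. Note that pca_prefix is used as a literal
# string prefix (no '/' is added), so include the trailing separator yourself.
run_pca('gs://my-bucket/ld_pruned.mt', 'gs://my-bucket/pca/dataset_', overwrite=True)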
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    # Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4)
                        & (mt.sample_qc.call_rate >= 0.97))

    # Genotype QC: filter entries by allele balance
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1))
                           | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75))
                           | (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)

    # Variant QC: keep variants with alternate allele frequency above 1%
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[
            1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]
        ])
    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
BROWSER_GENE_RESULTS_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02_browser/data/ht/browser_gene_results_table.ht'

ht = mt.rows().key_by('locus', 'alleles')

# Throw away everything except what we need for variant annotations
ht = ht.select(
    transcript_csq=ht.annotation.vep.transcript_consequences,
    worst_csq_for_variant_canonical=ht.annotation.vep.worst_csq_for_variant_canonical,
    csq_analysis=ht.annotation.consequence_category,
    csq_worst=ht.annotation.vep.most_severe_consequence,
    cadd=ht.annotation.cadd.PHRED_score,
    mpc=ht.annotation.mpc.MPC)

# Explode on transcript consequences (one row per variant-transcript pair)
ht = ht.explode('transcript_csq', name='transcript_csq')

ht = ht.select(
    variant_id=hl.variant_str(ht.locus, ht.alleles),
    gene_id=ht.worst_csq_for_variant_canonical.gene_id,
    gene_name=ht.worst_csq_for_variant_canonical.gene_symbol,
    canonical_transcript_id=ht.worst_csq_for_variant_canonical.transcript_id,
    transcript_id=ht.transcript_csq.transcript_id,
    hgvsc_canonical=ht.worst_csq_for_variant_canonical.hgvsc,
    hgvsc=ht.transcript_csq.hgvsc,
    hgvsp_canonical=ht.worst_csq_for_variant_canonical.hgvsp,
    hgvsp=ht.transcript_csq.hgvsp,
    csq_analysis=ht.csq_analysis,
    csq_worst=ht.csq_worst,
    csq_canonical=ht.worst_csq_for_variant_canonical.most_severe_consequence,
    cadd=ht.cadd,
    mpc=ht.mpc,
    polyphen=ht.worst_csq_for_variant_canonical.polyphen_prediction
).write(BROWSER_VARIANT_ANNOTATION_TABLE, overwrite=True)  # path constant defined elsewhere in the script
contig_recoding=recoding_dict)

### Filter to PASS variants and split multiallelics
mt2 = mt.filter_rows(mt.filters.size() > 0, keep=False)
mt2 = hl.split_multi_hts(mt2)

# Keep entries with at least 2 alt-supporting reads and at least one read in
# both forward (F1R2) and reverse (F2R1) orientations
mt3 = mt2.filter_entries(
    ((mt2.AD[1] < 2) | (mt2.F1R2[1] == 0) | (mt2.F2R1[1] == 0)), keep=False)

# Remove monomorphic variants
mt3 = hl.variant_qc(mt3)
mt3 = mt3.filter_rows(
    (mt3.variant_qc.AF[1] > 0) & (mt3.variant_qc.AF[1] < 1), keep=True)

mt4 = mt3.annotate_rows(
    v=hl.variant_str(mt3.locus, mt3.alleles),
    NumAltAlleles=hl.agg.max(mt3.GT.n_alt_alleles()),
    VAF=hl.agg.explode(lambda x: hl.agg.mean(x), mt3.AF),
    TLOD=mt3.info.TLOD[0],
    GERMQ=mt3.info.GERMQ,
    STR=mt3.info.STR,
    AD_alt=hl.agg.mean(mt3.AD[1]),
    AD_ref=hl.agg.mean(mt3.AD[0]))

# One-sided binomial test of the alt read count against an expected 0.5 VAF
mt4 = mt4.annotate_entries(
    Binomial_Prob=hl.binom_test(mt4.AD[1], mt4.DP, 0.5, 'greater'))

mt4 = mt4.key_rows_by('v')
mt4 = mt4.drop('locus', 'alleles', 'qual', 'filters', 'variant_qc', 'GQ',
               'PGT', 'PID', 'PL', 'PS', 'info', 'rsid', 'a_index', 'was_split')
filt2 = mt4.count_rows()
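# Quick sanity check of the binomial test used for Binomial_Prob above: the
# one-sided p-value for the alt read count given total depth under an expected
# allele fraction of 0.5, here with hypothetical counts of 7 alt reads out of 10.
import hail as hl
print(hl.eval(hl.binom_test(7, 10, 0.5, 'greater')))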
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size):
    # Load id_conversion file
    #table_idconv=hl.import_table('id_conversion')
    # Load markers files
    #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True)
    #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True)
    # cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table
    # awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2
    hl.init()

    # Wait for the first file to become available on the download queue
    cond1.acquire()
    while not an_item_is_available(q):
        #print("Thread hail to sleep")
        #time.sleep(300)
        print("Thread hail to wait")
        cond1.wait()
    file = get_an_available_item(q)
    print("Thread hail get item " + file)
    qaws_size = qaws_size - 1
    cond1.release()

    interval_table = hl.import_locus_intervals('interval_table2',
                                               reference_genome='GRCh38')

    while file != "END":
        fileParts = file.split("/")[-1]
        fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "")
        chrName = fileName.split("_")[-3]
        #myFNAL=fileName.split("\\.")
        #myTempId=myFNAL[0]
        # Load gVCF file
        #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38')
        #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38')
        try:
            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38')
            #data=hl.import_vcf(file.replace("s3://","s3a://"),force_bgz=True,reference_genome='GRCh38')

            # Keep PASS variants only
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)

            # Split multiallelic sites
            data = hl.split_multi_hts(data)

            # Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))

            # Replace with 0s and 1s
            # Export
            #data_filtered_annot=data_filtered.annotate_entries(output=(data_filtered.GT.is_het()|data_filtered.GT.is_hom_var()))

            # Re-key the rows by a variant string that contains the alleles,
            # not just the position
            variant_key = data_filtered.key_rows_by(variant=hl.variant_str(
                data_filtered.locus, data_filtered.alleles))

            # Export the field created above and the count of ALT alleles
            #variant_key.GT.n_alt_alleles().export(outputDir+"/"+fileName+".tsv")
            # Export GT rather than only the number of alt alleles because the
            # sex chromosomes need the full genotype
            variant_key.GT.export(outputDir + "/" + fileName + ".tsv")

            # Extract INFO fields
            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 drop_samples=True)

            # Keep PASS variants only
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)

            # Split multiallelic sites
            data = hl.split_multi_hts(data)

            # Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))

            if chrName != "chrY":
                data_sr = data_filtered.select_rows(
                    data_filtered.info.medianDepthAll,
                    data_filtered.info.medianDepthNonMiss,
                    data_filtered.info.medianGQ,
                    data_filtered.info.missingness,
                    data_filtered.info.completeGTRatio,
                    data_filtered.info.ABratio,
                    data_filtered.info.MendelSite,
                    data_filtered.info.AN,
                    data_filtered.info.AC,
                    data_filtered.info.AC_Hom,
                    data_filtered.info.AC_Het)
            else:
                data_sr = data_filtered.select_rows(data_filtered.info.AN,
                                                    data_filtered.info.AC,
                                                    data_filtered.info.AC_Hom,
                                                    data_filtered.info.AC_Het)

            ht = data_sr.make_table()
            ht.export(outputDir + "/" + fileName + "_INFO.tsv")

            # Strip array brackets from the exported values and append the
            # rows (minus the header) to the per-chromosome INFO file
            os.system("sed -i 's/\\[//g' " + outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/]//g' " + outputDir + "/" + fileName + "_INFO.tsv")
            os.system("cat " + outputDir + "/" + fileName + "_INFO.tsv | grep -v locus " +
                      " >> " + outputDir + "/INFO_" + chrName)

            cond2.acquire()
            print("Thread hail make item available " + fileName)
            make_an_item_available(qcm, file)
            cond2.notify_all()
            cond2.release()
        except FatalError as e:
            print("Exception2 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)
        except AssertionError as e:
            print("Exception3 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)
        except Exception as e:
            print("Exception in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)
            #raise Exception

        # Wait for the next file from the download queue
        cond1.acquire()
        while not an_item_is_available(q):
            #print("Thread hail to sleep")
            #time.sleep(300)
            print("Thread hail to wait")
            cond1.wait()
        file = get_an_available_item(q)
        print("Thread hail get item " + file)
        qaws_size = qaws_size - 1
        cond1.release()

    time.sleep(300)

    # Signal the downstream consumer that no more files will arrive
    cond2.acquire()
    print("Thread hail make END available")
    make_an_item_available(qcm, "END")
    cond2.notify_all()
    cond2.release()
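# Hypothetical sketch of the queue helpers hailthread relies on
# (an_item_is_available, get_an_available_item, make_an_item_available); the
# real implementations live elsewhere in this script. They assume a plain list
# used as a FIFO queue, guarded externally by the Condition objects passed in.
def an_item_is_available(q):
    return len(q) > 0


def get_an_available_item(q):
    return q.pop(0)


def make_an_item_available(q, item):
    q.append(item)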
]), delimiter=':'))

# prep to merge with GWAS variant list
ht_mfi = ht_mfi.key_by('variant')
ht_mfi = ht_mfi.annotate(maf=hl.float(ht_mfi.maf),
                         info=hl.float(ht_mfi.info))
ht_mfi = ht_mfi.select('varid', 'rsid', 'maf', 'info')

#######
# load GWAS variant list
#######

# get GWAS variant list
ht_sites = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_variants.ht')
ht_sites = ht_sites.annotate(
    variant=hl.variant_str(ht_sites.locus, ht_sites.alleles))
ht_sites = ht_sites.key_by('variant')

########
# merge and save
########

# get final merged file with maf/info of the gwas variants
ht = ht_mfi.join(ht_sites, how='inner')
ht = ht.select('locus', 'alleles', 'varid', 'rsid', 'maf', 'info')
print(ht.count())

# save both ht and tsv
ht.write('gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.ht',
         overwrite=True)
ht.export('gs://ukb31063/ukb31063.neale_gwas_variants.imputed_v3.mfi.tsv.bgz')