def startTestHailContext():
    global _initialized
    if not _initialized:
        url = os.environ.get('HAIL_TEST_SERVICE_BACKEND_URL')
        if url:
            hl.init(master='local[2]', min_block_size=0, quiet=True,
                    _backend=hl.backend.ServiceBackend(url))
        else:
            hl.init(master='local[2]', min_block_size=0, quiet=True)
        _initialized = True
def initialize(cores, log, n_iter):
    assert not _initialized
    hl.init(master=f'local[{cores}]', quiet=True, log=log)

    global _n_iter
    _n_iter = n_iter

    download_data()

    # make JVM do something to ensure that it is fresh
    hl.utils.range_table(1)._force_count()
def main():
    parser = argparse.ArgumentParser(description="Driver for hail's gVCF combiner")
    parser.add_argument('--sample-map',
                        help='path to the sample map (must be filesystem local)',
                        required=True)
    parser.add_argument('--sample-file',
                        help='path to a file containing a line-separated list '
                             'of samples to combine (must be filesystem local)')
    parser.add_argument('--tmp-path',
                        help='path to folder for temp output (can be a cloud bucket)',
                        default='/tmp')
    parser.add_argument('--out-file', '-o',
                        help='path to final combiner output',
                        required=True)
    parser.add_argument('--summarize',
                        help='if defined, run summarize, placing the rows table '
                             'of the output at the argument value')
    parser.add_argument('--json',
                        help='json to use for the import of the gVCFs '
                             '(must be filesystem local)',
                        required=True)
    args = parser.parse_args()
    samples = build_sample_list(args.sample_map, args.sample_file)
    with open(args.json) as j:
        json = j.read()
    hl.init(default_reference=DEFAULT_REF,
            log='/hail-joint-caller-' + time.strftime('%Y%m%d-%H%M') + '.log')
    run_combiner(samples, json, args.out_file, args.tmp_path,
                 summary_path=args.summarize, overwrite=True)
def main():
    hl.init()
    data = hl.import_vcf(
        os.path.join(PROJECT_DIR, 'data/chr22_1000_missing.vcf'))
    labels = hl.import_table(os.path.join(PROJECT_DIR, 'data/chr22-labels.csv'),
                             delimiter=',',
                             types={'22_16050408': 'float64'}).key_by('sample')

    mt = data.annotate_cols(pheno=labels[data.s])
    y = mt.pheno['22_16050408']
    x = mt.GT.n_alt_alleles()

    mt = matrix_table_source('random_forest_model/x', x)
    check_entry_indexed('random_forest_model/x', x)
    mts = mt._select_all(col_exprs=dict(y=y),
                         row_exprs=dict(),
                         col_key=[],
                         entry_exprs=dict(e=x))
    mts.write(os.path.join(
        PROJECT_DIR, 'src/test/data/hail/chr22_1000_missing-22_16050408.vds'),
        overwrite=True)
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    gnomad_loadings_path = f'{output}/gnomad_loadings_90k_liftover.ht'

    # liftover and get variants
    ht_gnomad_loadings = hl.read_table(GNOMAD_V2_LOADINGS)
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    ht_gnomad_loadings_liftover = ht_gnomad_loadings.annotate(
        liftover=hl.liftover(ht_gnomad_loadings.locus, 'GRCh38',
                             include_strand=False),
        old_locus=ht_gnomad_loadings.locus,
    )
    ht_gnomad_loadings_liftover = ht_gnomad_loadings_liftover.key_by(
        locus=ht_gnomad_loadings_liftover.liftover)

    # save gnomad loadings
    ht_gnomad_loadings_liftover.write(gnomad_loadings_path, overwrite=True)
def main(args):
    hl.init(
        log="/variant_filter.log",
        tmp_dir="gs://ccdg-30day-temp/",
        default_reference="GRCh38",
    )
    # TODO: This flag can be removed if this error is no longer relevant:
    # log4j:ERROR Failed to flush writer, java.io.IOException: No space left on device
    # when trying to write a densified MT from VDS
    hl._set_flags(distributed_scan_comb_op="1")

    if args.update_ccdg_exome_interval_table:
        ccdg_interval_qc_ht(args.pct_samples_defined, overwrite=True)

    determine_pca_variants(
        autosomes_only=not args.not_autosomes_only,
        bi_allelic_only=not args.not_bi_allelic_only,
        adj_only=not args.not_adj_only,
        snv_only=not args.not_snv_only,
        min_gnomad_v3_ac=args.gnomad_v3_ac_filter,
        high_qual_ccdg_exome_interval_only=not args.not_high_qual_ccdg_interval_only,
        high_qual_ukbb_exome_interval_only=not args.not_high_qual_ukbb_interval_only,
        filter_lcr=not args.not_filter_lcr,
        filter_segdup=not args.not_filter_segdup,
        min_joint_af=args.min_af,
        min_joint_callrate=args.min_callrate,
        min_ccdg_exome_callrate=args.ccdg_exome_callrate_cutoff,
        min_ukbb_exome_callrate=args.ukbb_exome_callrate_cutoff,
        ld_pruning=not args.not_ld_pruning,
        ld_pruning_dataset=args.ld_pruning_dataset,
        ld_r2=args.ld_r2,
        read_per_dataset_checkpoint_if_exists=args.read_per_dataset_checkpoint_if_exists,
        read_pre_ld_prune_ht_checkpoint_if_exists=args.read_pre_ld_prune_ht_checkpoint_if_exists,
        read_pre_ld_prune_mt_checkpoint_if_exists=args.read_pre_ld_prune_mt_checkpoint_if_exists,
        overwrite=args.overwrite,
        filter_washu=args.filter_washu,
    )
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0]
    for i in range(0, number_of_pcs):
        pc = i + 1
        p = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = f'{output}/loadings_manhattan_plot_pc' + str(pc) + '.png'
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        plot_filename_html = 'loadings_pc' + str(pc) + '.html'
        output_file(plot_filename_html)
        save(p)
        subprocess.run(['gsutil', 'cp', plot_filename_html, output], check=False)
def main():
    use_tabix = True

    hl.init(log='/Users/nbaya/Downloads/get_chr_pos.log')

    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')
    b = hb.Batch(name='get_chr_pos',
                 backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
                 default_storage='2G',
                 default_cpu=1)

    paths = get_paths()

    for path in paths:
        print(path)
        annotate_chr_pos(b=b, path=path, use_tabix=use_tabix)

    b.run(open=True)
    backend.close()
import hail as hl
import logging
import os
import pandas as pd
import re
import subprocess

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

hl.init(log="/dev/null")

#%%

ht = hl.read_table(
    "gs://gnomad/metadata/genomes_v3.1/gnomad_v3.1_sample_qc_metadata.ht")
ht = ht.filter(ht.release)
release_samples = ht.s.collect()

#%%

sample_ids_gnomad_v3 = hl.hadoop_open(
    "gs://gnomad-bw2/sample_ids_gnomad_v3__20210131.txt").read().split("\n")
release_sample_ids_gnomad_v3 = list(
    set(sample_ids_gnomad_v3) & set(release_samples))  # 39285 samples

sample_ids_gnomad_v3_1 = hl.hadoop_open(
    "gs://gnomad-bw2/sample_ids_gnomad_v3_1__20210131.txt").read().split("\n")
release_sample_ids_gnomad_v3_1 = list(
    set(sample_ids_gnomad_v3_1) & set(release_samples))  # 3526 samples
import functools as ft
import json
import os

import uvloop
from aiohttp import web
import jwt

import hail as hl
from hail.utils import FatalError
from hail.utils.java import Env, info, scala_object

import hailjwt as hj

uvloop.install()

master = os.environ.get('HAIL_APISERVER_SPARK_MASTER')
hl.init(master=master, min_block_size=0)

app = web.Application()
routes = web.RouteTableDef()

with open(os.environ.get('HAIL_JWT_SECRET_KEY_FILE') or '/jwt-secret/secret-key') as f:
    jwtclient = hj.JWTClient(f.read())


def authenticated_users_only(fun):
    @ft.wraps(fun)
    def wrapped(request, *args, **kwargs):
        encoded_token = request.cookies.get('user')
        if encoded_token is not None:
            try:
def main(args): # Init Hail hl.init(default_reference=args.default_ref_genome) # Import VEPed VCF file as MatrixTable and get VCF file meta-data # vcf_path = args.vcf_vep_path mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz) # getting annotated VEP fields names from VCF-header vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(), vep_csq_field=args.csq_field) if args.split_multi_allelic: # split multi-allelic variants mt = hl.split_multi_hts(mt) # split/annotate fields in the info field (use allele index ) mt = mt.annotate_rows(info=mt.info.annotate( **{field: mt.info[field][mt.a_index - 1] for field in INFO_FIELDS})) # parse/annotate the CSQ field in a different structure tb_csq = mt.rows() tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field])) # Convert/annotate all transcripts per variants with a structure of type array<dict<str, str>>. # The transcript(s) are represented as a dict<k,v>, where keys are the field names extracted from the VCF header and # the values are the current annotated values in the CSQ field. tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map( lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]')))))) # Keep transcript(s) matching with the allele index (only used if variant were split with split_multi_hts) # It requires having the flag "ALLELE_NUM" annotated by VEP # Apply only were the alleles were split. # TODO: Handle exception when the flag "ALLELE_NUM" is not present if all( [x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']]): tb_csq = (tb_csq.annotate(csq_raw=hl.cond( tb_csq.was_split, tb_csq.csq_raw.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq. a_index)), tb_csq.csq_raw))) # select and annotate one transcript per variant based on pre-defined rules tb_csq = pick_transcript( ht=tb_csq, csq_array='csq_raw', ) # Expand selected transcript (dict) annotations adding independent fields. tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep') # Parse the "Consequence" field. Keep only the more severe consequence. # Avoid the notation "consequence_1&consequence_2" tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate( Consequence=tb_csq.vep.Consequence.split('&')[0]))) # Parse the protein DOMAIN field if 'DOMAINS' in vep_fields: tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate( DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS'])))) # drop redundant/temp fields tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500)) # print fields overview tb_csq.describe() # write table as HailTable to disk # (tb_csq # .write(output=args.tb_output_path, # overwrite=args.overwrite) # ) output_path = get_variant_qc_ht_path(part='vep_vqsr', split=args.split_multi_allelic) tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite)) if args.write_to_file: # write table to disk as a BGZ-compressed TSV file (tb_csq.export(f'{output_path}.tsv.bgz')) # Stop Hail hl.stop()
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(cohort_sample_codes=hl.if_else(
        scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs'))
    labels = scores.cohort_sample_codes
    hover_fields = dict([('s', scores.s)])

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, number_of_pcs - 1):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            scores.scores[pc1],
            scores.scores[pc2],
            label=labels,
            title='TOB-WGS + TOB SNP Chip',
            xlabel='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            ylabel='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            hover_fields=hover_fields,
        )
        plot_filename = output_path('pc' + str(pc2) + '.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Get partner sample information
    sample_names = scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'

        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''

        return prefix + tech

    # save as html
    labels = list(map(sample_type, sample_names))
    html = pd.DataFrame({
        'sample_name': sample_names,
        'sample_tech': labels
    }).to_html()
    plot_filename_html = output_path('sample_technology.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, number_of_pcs - 1):
        pc1 = i
        pc2 = i + 1

        plot = figure(
            title='Reprocessed Sample Projection',
            x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path('technology_type_pc' + str(pc2) + '.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'technology_type_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
#!/usr/bin/env python3
import hail as hl

hl.init(tmp_dir='/net/scratch/people/plggosborcz',
        spark_conf={'spark.driver.memory': '90G',
                    'spark.executor.memory': '90G'},
        default_reference='GRCh38')

europeans = hl.import_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/europeans',
    delimiter="\t", no_header=True)

to_keep = europeans['f0'].collect()

controls = hl.read_matrix_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/gnomad.genomes.v3.1.2.hgdp_1kg_subset_sparse.mt')

controls = controls.filter_cols(hl.literal(to_keep).contains(controls.s))

controls.write(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/1kg-europeans-sparse.ht')
def main(args): hl.init(default_reference="GRCh38", log="/variant_histograms.log") logger.info("Loading ANNOTATIONS_HISTS dictionary...") if not file_exists(annotation_hists_path()): raise DataException( "Annotation hists JSON file not found. Need to create this JSON before running script!" ) with hl.hadoop_open(annotation_hists_path()) as a: ANNOTATIONS_HISTS = json.loads(a.read()) # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data ht = hl.read_table(release_ht_path(public=False)) ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS)) inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"] # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be # handled differently. It is stored as a dictionary in annotation_hists_path ANNOTATIONS_HISTS.remove("InbreedingCoeff") logger.info("Getting info annotation histograms...") hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS) # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary if args.determine_bounds: logger.info( "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10." ) minmax_dict = {} for metric in ANNOTATIONS_HISTS: minmax_dict[metric] = hl.struct( min=hl.agg.min(ht.info[metric]), max=hl.if_else( hl.agg.max(ht.info[metric]) < 1e10, hl.agg.max(ht.info[metric]), 1e10, ), ) minmax = ht.aggregate(hl.struct(**minmax_dict)) logger.info(f"Metrics bounds: {minmax}") else: logger.info( "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can " "be used to help define these ranges..." ) hists = ht.aggregate( hl.array( [ hist_expr.annotate(metric=hist_metric) for hist_metric, hist_expr in hist_ranges_expr.items() ] ) .extend( hl.array( hl.agg.group_by( create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF), hl.agg.hist( hl.log10(ht.info.QUALapprox), *ANNOTATIONS_HISTS["QUALapprox"], ), ) ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0])) ) .extend( hl.array( hl.agg.group_by( create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF), hl.agg.hist( hl.log10(ht.info.AS_QUALapprox), *ANNOTATIONS_HISTS["AS_QUALapprox"], ), ) ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0])) ), _localize=False, ) # Defining hist range and bins for allele frequency groups because they needed different ranges ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF)) inbreeding_hists = [ ht.aggregate( hl.agg.filter( ht.af_bin == x, hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x],), ) ).annotate(metric="InbreedingCoeff" + "-" + x) for x in inbreeding_bin_ranges ] hists = hl.eval(hl.json(hists)) inbreeding_hists = hl.eval(hl.json(inbreeding_hists)) # Note: The following removes '}' from the JSON stored in hists and '{' from the JSON stored in # inbreeding_hists then joins them together to be written out as a single JSON hists = hists[:-1] + "," + inbreeding_hists[1:] logger.info("Writing output") with hl.hadoop_open(qual_hists_json_path(), "w") as f: f.write(hists)
    #ht_out = ht_out.annotate(**covs[ht_out.key])
    ht_comb = ht_out.select(*p_max.keys(),
                            age=ht_out.phenotypes.age,
                            sex=ht_out.phenotypes.sex,
                            pheno=ht_out.phenotypes[pheno])

    output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
    #ht_comb.describe()
    #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
    #ht_comb = hl.read_table(output_location + '.ht')
    ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--ss_clump_prefix',
        default='gs://apcdr/prs_sumstats_clump/apcdr_ukb_10k_eur_holdout_meta/')
    parser.add_argument('--ss_suffix', default='.meta.bgz')
    parser.add_argument('--chr_pos_ref_alt_p_beta', default='CHR,POS,A1,A2,P,BETA')
    parser.add_argument('--overwrite', action='store_true')
    args = parser.parse_args()

    hl.init(log='/prs.log')
    main(args)
    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA', xlabel='PC1', ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)


if __name__ == "__main__":
    # need to create spark cluster first before initialising hail
    sc = pyspark.SparkContext()

    # Define the hail persistent storage directory
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")

    # s3 credentials required for user to access the datasets in farm flexible compute s3 environment
    # you may use your own here from your .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()
    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    #####################################################################
    ######################     INPUT DATA      #########################
    #####################################################################
    parser = argparse.ArgumentParser()
    # Read the matrixtable, chrX and chrY should be included
    input_params = parser.add_argument_group("Input parameters")
    input_params.add_argument(
        "--matrixtable",
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size): #Load id_conversion file #table_idconv=hl.import_table('id_conversion') #Load markers files #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True) #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True) #cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table #awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2 hl.init() cond1.acquire() while not an_item_is_available(q): #print("Thread hail to sleep") #time.sleep(300) print("Thread hail to wait") cond1.wait() file = get_an_available_item(q) print("Thread hail get item " + file) qaws_size = qaws_size - 1 cond1.release() interval_table = hl.import_locus_intervals('interval_table2', reference_genome='GRCh38') while file != "END": fileParts = file.split("/")[-1] fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "") chrName = fileName.split("_")[-3] #myFNAL=fileName.split("\\.") #myTempId=myFNAL[0] #Load gVCF file #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38') #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38') try: #Extract INFO fields data = hl.import_vcf(inputDir + "/" + fileParts, force_bgz=True, reference_genome='GRCh38', drop_samples=True) #Filters PASS if chrName != "chrY": data = data.filter_rows(data.filters.size() > 0, keep=False) #Multiallelic data = hl.split_multi_hts(data) #Join with markers data_filtered = data.filter_rows( hl.is_defined(interval_table[data.locus])) data_sr = data_filtered.select_rows( data_filtered.info.medianDepthAll, data_filtered.info.medianDepthNonMiss, data_filtered.info.medianGQ, data_filtered.info.missingness, data_filtered.info.completeGTRatio, data_filtered.info.ABratio, data_filtered.info.MendelSite, data_filtered.info.AN, data_filtered.info.AC, data_filtered.info.AC_Hom, data_filtered.info.AC_Het) ht = data_sr.make_table() ht.export(outputDir + "/" + fileName + "_INFO.tsv") os.system("sed -i 's/\[//g' " + outputDir + "/" + fileName + "_INFO.tsv") os.system("sed -i 's/]//g' " + outputDir + "/" + fileName + "_INFO.tsv") os.system("cat " + outputDir + "/" + fileName + "_INFO.tsv | grep -v locus " + " >> " + outputDir + "/INFO_" + chrName) os.system("rm " + inputDir + "/" + fileParts) cond2.acquire() print("Thread hail make item available " + fileName) make_an_item_available(qcm, file) cond2.notify_all() cond2.release() except FatalError as e: print("Exception2 in file:" + file) os.system("rm " + inputDir + "/" + fileParts) except AssertionError as e: print("Exception3 in file:" + file) os.system("rm " + inputDir + "/" + fileParts) except Exception as e: print("Exception in file:" + file) os.system("rm " + inputDir + "/" + fileParts) #raise Exception cond1.acquire() while not an_item_is_available(q): #print("Thread hail to sleep") #time.sleep(300) print("Thread hail to wait") cond1.wait() file = get_an_available_item(q) print("Thread hail get item " + file) qaws_size = qaws_size - 1 cond1.release() time.sleep(300) cond2.acquire() print("Thread hail make END available") make_an_item_available(qcm, "END") cond2.notify_all() cond2.release()
def startTestHailContext():
    global _initialized
    if not _initialized:
        hail.init(master='local[2]', min_block_size=0, quiet=True)
        _initialized = True
import hail as hl
from hail.utils.java import Env, info

import logging
import flask

hl.init()

app = flask.Flask('hail-apiserver')


@app.route('/execute', methods=['POST'])
def execute():
    code = flask.request.json
    info(f'execute: {code}')

    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})

    result = {
        'type': str(typ),
        'value': value
    }

    info(f'result: {result}')

    return flask.jsonify(result)
import json

import hail as hl

gvcfs = ['gs://hail-ci/gvcfs/HG00096.g.vcf.gz',
         'gs://hail-ci/gvcfs/HG00268.g.vcf.gz']

hl.init(default_reference='GRCh38')

parts = [
    {'start': {'locus': {'contig': 'chr20', 'position': 17821257}},
     'end': {'locus': {'contig': 'chr20', 'position': 18708366}},
     'includeStart': True, 'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 18708367}},
     'end': {'locus': {'contig': 'chr20', 'position': 19776611}},
     'includeStart': True, 'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 19776612}},
     'end': {'locus': {'contig': 'chr20', 'position': 21144633}},
     'includeStart': True, 'includeEnd': True},
]
parts_str = json.dumps(parts)

vcfs = hl.import_vcfs(gvcfs, parts_str)
def handler(signum, frame):
    global _timeout_state
    _timeout_state = True
    hl.stop()
    hl.init(**_init_args)
    raise BenchmarkTimeoutError()
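# Hedged usage sketch (not part of the original snippet): a handler with this
# (signum, frame) signature only takes effect once it is registered with the
# standard-library signal module. `timeout_seconds` and `run_benchmark` are
# illustrative assumptions, not names taken from the source.
import signal


def run_with_timeout(timeout_seconds, run_benchmark):
    signal.signal(signal.SIGALRM, handler)  # route SIGALRM to the handler above
    signal.alarm(timeout_seconds)           # schedule SIGALRM after the time budget
    try:
        return run_benchmark()
    finally:
        signal.alarm(0)                     # cancel any pending alarm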
def main(): # # Args (local) # chrom = 11 # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz' # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen' # in_sample = 'output/ukb_10k_downsampled.sample' # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv' # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k' # cores = 1 # Use "*" for all # maf_threshold = 0.001 # Args (server) chrom = sys.argv[1] chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz' in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen' in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample' to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv' out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k' cores = sys.argv[2] # Use "*" for all maf_threshold = 0.001 # Set the maximum number of cores hl.init(master="local[{}]".format(cores)) # Prepare liftover rg37 = hl.get_reference('GRCh37') rg38 = hl.get_reference('GRCh38') rg37.add_liftover(chain_file, rg38) # Create my own rg38 with altered names rg38_custom_contigs = [ contig.replace('chr', '') for contig in rg38.contigs ] rg38_custom_lens = {} for contig in rg38.lengths: rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig] rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs, rg38_custom_lens) print('Processing chromosome {0}'.format(chrom)) # Index bgen if not existing if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'): hl.index_bgen(in_bgen.format(chrom=chrom), contig_recoding={ "01": "1", "02": "2", "03": "3", "04": "4", "05": "5", "06": "6", "07": "7", "08": "8", "09": "9" }, reference_genome='GRCh37') # Load bgen mt = hl.import_bgen(in_bgen.format(chrom=chrom), entry_fields=['GT'], sample_file=in_sample) # Load list samples to keep samples_to_keep = hl.import_table(to_keep_list, no_header=True, impute=False, types={ 'f0': hl.tstr }).key_by('f0') # Downsample to required subset of samples mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s])) # Re-call to remove phasing (required for plink output) # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False)) # Filter on MAF mt = hl.variant_qc(mt) mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate( MAF=hl.min(mt.variant_qc.AF))) mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold) # Liftover mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38')) # Strip chr from contig name (causes problems with GCTA) mt = mt.annotate_rows( contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', '')) # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom) mt = mt.key_rows_by() mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38, mt.locus_GRCh38.position, reference_genome=rg38_custom)) mt = mt.key_rows_by(mt.locus, mt.alleles) # Remove rows with missing locus (after liftover) mt = mt.filter_rows(hl.is_defined(mt.locus)) # Write plink format hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom)) return 0
    def test_init_hail_context_twice(self):
        hl.init(hl.spark_context(), idempotent=True)  # Should be no error
# coding: utf-8

import hail as hl
import hail.expr.aggregators as agg
import numpy as np
import matplotlib.pyplot as plt
from math import log, isnan
from pprint import pprint
import time

hl.init()  # Initialize Hail and Spark.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# ## key step
# ### 1. extract pca info, transform it to dataframe
# ### 2. build linear regression model, predict y and get y residuals
# ### 3. store y residuals in hail MatrixTable
# ### 4. run gwas and compare time

# Import a PLINK dataset (BED, BIM, FAM) as a MatrixTable
vds = hl.import_plink('gs://ukb_testdata/maf_0.01_10.bed',
                      'gs://ukb_testdata/maf_0.01_10.bim',
                      'gs://ukb_testdata/maf_0.01_10.fam')

# Import delimited text file (text table) as Table
# import phenotype
table = (hl.import_table('gs://ukb_testdata/sleep_duration.tsv',
                         delimiter='\t',
def main(args): hl.init(log="/variant_qc_evaluation.log") if args.create_bin_ht: create_bin_ht( args.model_id, args.n_bins, ).write( get_score_bins(args.model_id, aggregated=False).path, overwrite=args.overwrite, ) if args.run_sanity_checks: ht = get_score_bins(args.model_id, aggregated=False).ht() logger.info("Running sanity checks...") print( ht.aggregate( hl.struct( was_biallelic=hl.agg.counter(~ht.was_split), has_biallelic_rank=hl.agg.counter( hl.is_defined(ht.biallelic_bin)), was_singleton=hl.agg.counter(ht.singleton), has_singleton_rank=hl.agg.counter( hl.is_defined(ht.singleton_bin)), was_biallelic_singleton=hl.agg.counter(ht.singleton & ~ht.was_split), has_biallelic_singleton_rank=hl.agg.counter( hl.is_defined(ht.biallelic_singleton_bin)), ))) if args.create_aggregated_bin_ht: logger.warning( "Use only workers, it typically crashes with preemptibles") create_aggregated_bin_ht(args.model_id).write( get_score_bins(args.model_id, aggregated=True).path, overwrite=args.overwrite, ) if args.extract_truth_samples: logger.info(f"Extracting truth samples from MT...") mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, remove_hard_filtered_samples=False) mt = mt.filter_cols( hl.literal([v["s"] for k, v in TRUTH_SAMPLES.items()]).contains(mt.s)) mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True) # Checkpoint to prevent needing to go through the large table a second time mt = mt.checkpoint( get_checkpoint_path("truth_samples", mt=True), overwrite=args.overwrite, ) for truth_sample in TRUTH_SAMPLES: truth_sample_mt = mt.filter_cols( mt.s == TRUTH_SAMPLES[truth_sample]["s"]) # Filter to variants in truth data truth_sample_mt = truth_sample_mt.filter_rows( hl.agg.any(truth_sample_mt.GT.is_non_ref())) truth_sample_mt.naive_coalesce(args.n_partitions).write( get_callset_truth_data(truth_sample).path, overwrite=args.overwrite, ) if args.merge_with_truth_data: for truth_sample in TRUTH_SAMPLES: logger.info( f"Creating a merged table with callset truth sample and truth data for {truth_sample}..." ) # Load truth data mt = get_callset_truth_data(truth_sample).mt() truth_hc_intervals = TRUTH_SAMPLES[truth_sample][ "hc_intervals"].ht() truth_mt = TRUTH_SAMPLES[truth_sample]["truth_mt"].mt() truth_mt = truth_mt.key_cols_by( s=hl.str(TRUTH_SAMPLES[truth_sample]["s"])) # Remove low quality sites info_ht = get_info(split=True).ht() mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual) ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals) ht.write( get_callset_truth_data(truth_sample, mt=False).path, overwrite=args.overwrite, ) if args.bin_truth_sample_concordance: for truth_sample in TRUTH_SAMPLES: logger.info( f"Creating binned concordance table for {truth_sample} for model {args.model_id}" ) ht = get_callset_truth_data(truth_sample, mt=False).ht() info_ht = get_info(split=True).ht() ht = ht.filter( ~info_ht[ht.key].AS_lowqual & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus])) logger.info("Filtering out low confidence regions and segdups...") ht = filter_low_conf_regions( ht, filter_lcr=True, # TODO: Uncomment when we have decoy path filter_decoy=False, # True, filter_segdup=True, ) logger.info( "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..." 
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            ht = ht.filter(hl.is_defined(metric_ht[ht.key]))

            ht = ht.annotate(score=metric_ht[ht.key].score)

            ht = compute_binned_truth_sample_concordance(
                ht, metric_ht, args.n_bins)
            ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )
    get_expr_for_variant_loftee_flag_flag,
    get_expr_for_genes_with_loftee_flag_flag,
    get_expr_for_ref_allele,
    get_expr_for_start_pos,
    get_expr_for_variant_id,
    get_expr_for_vep_sorted_transcript_consequences_array,
    get_expr_for_xpos,
)

p = argparse.ArgumentParser()
p.add_argument("--input-url", help="URL of gnomAD 2.1 flattened Hail table to export", required=True)
p.add_argument("--output-url", help="URL to write shaped Hail table to", required=True)
p.add_argument("--subset", help="Filter variants to this chrom:start-end range")
args = p.parse_args()

hl.init(log="/tmp/hail.log")

ds = hl.read_table(args.input_url)

# The globals in the flattened Hail table cause a serialization error during export to ES.
ds = ds.select_globals()

if args.subset:
    subset_interval = hl.parse_locus_interval(args.subset)
    ds = ds.filter(subset_interval.contains(ds.locus))

####################
# Top level fields #
####################

# These fields remain at the top level
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset,
    # but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

    mt = mt.annotate_entries(
        GT=hl.cond(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'InbreedingCoeff',
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
#conda activate hail
#cd /Users/mzekavat/opt/anaconda3/envs/hail
#hailctl dataproc start mz02 --master-machine-type n1-highmem-16 --worker-machine-type n1-highmem-16 --worker-boot-disk-size 200 --num-workers 3 --num-preemptible-workers 3 --master-boot-disk-size 100 --region us-east1 --zone us-east1-d --requester-pays-allow-all --properties "spark:spark.driver.memory=90G,spark:spark.driver.maxResultSize=50G,spark:spark.kryoserializer.buffer.max=1G,spark:spark.task.maxFailures=20,spark:spark.driver.extraJavaOptions=-Xss4M,spark:spark.executor.extraJavaOptions=-Xss4M,spark:spark.speculation=true"
#hailctl dataproc connect mz02 notebook --zone us-east1-d --region us-east1
#hailctl dataproc submit --zone us-east1-d --region us-east1 mz02 ~/Documents/Broad_2015_17/Python_Scripts_Hail/CHIP/Merge_SomaticVCFS_15000_30000.py

import hail as hl
import hail.expr.aggregators as agg
hl.init(default_reference="GRCh38")

import numpy as np
import pandas as pd
from collections import Counter
from math import log, isnan
from pprint import pprint
import time
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

recoding_dict = {f"{i + 1}": f"chr{i + 1}" for i in range(22)}
recoding_dict['X'] = 'chrX'
recoding_dict['Y'] = 'chrY'

files = hl.import_table('gs://maryam_lipids/UKBB_CHIP/filenames.txt',
                        impute=True, no_header=True)
files_list = [row['f0'] for row in files.select(files.f0).collect()]

for num in range(1, 10000):
    print(num)
    filenamev2 = files_list[num].strip()
    mt = hl.import_vcf(filenamev2,
def main(args): hl.init(log="/variant_qc_random_forest.log") if args.list_rf_runs: logger.info(f"RF runs:") pretty_print_runs(get_rf_runs(rf_run_path())) if args.annotate_for_rf: ht = create_rf_ht( impute_features=args.impute_features, adj=args.adj, n_partitions=args.n_partitions, checkpoint_path=get_checkpoint_path("rf_annotation"), ) ht.write( get_rf_annotations(args.adj).path, overwrite=args.overwrite, ) logger.info(f"Completed annotation wrangling for random forests model training") if args.train_rf: model_id = f"rf_{str(uuid.uuid4())[:8]}" rf_runs = get_rf_runs(rf_run_path()) while model_id in rf_runs: model_id = f"rf_{str(uuid.uuid4())[:8]}" ht, rf_model = train_rf( get_rf_annotations(args.adj).ht(), fp_to_tp=args.fp_to_tp, num_trees=args.num_trees, max_depth=args.max_depth, no_transmitted_singletons=args.no_transmitted_singletons, no_inbreeding_coeff=args.no_inbreeding_coeff, vqsr_training=args.vqsr_training, vqsr_model_id=args.vqsr_model_id, filter_centromere_telomere=args.filter_centromere_telomere, test_intervals=args.test_intervals, ) ht = ht.checkpoint( get_rf_training(model_id=model_id).path, overwrite=args.overwrite, ) logger.info("Adding run to RF run list") rf_runs[model_id] = get_run_data( input_args={ "transmitted_singletons": None if args.vqsr_training else not args.no_transmitted_singletons, "adj": args.adj, "vqsr_training": args.vqsr_training, "filter_centromere_telomere": args.filter_centromere_telomere, }, test_intervals=args.test_intervals, features_importance=hl.eval(ht.features_importance), test_results=hl.eval(ht.test_results), ) with hl.hadoop_open(rf_run_path(), "w") as f: json.dump(rf_runs, f) logger.info("Saving RF model") save_model( rf_model, get_rf_model_path(model_id=model_id), overwrite=args.overwrite, ) else: model_id = args.model_id if args.apply_rf: logger.info(f"Applying RF model {model_id}...") rf_model = load_model(get_rf_model_path(model_id=model_id)) ht = get_rf_training(model_id=model_id).ht() features = hl.eval(ht.features) ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL) logger.info("Finished applying RF model") ht = ht.annotate_globals(rf_model_id=model_id) ht = ht.checkpoint( get_rf_result(model_id=model_id).path, overwrite=args.overwrite, ) ht_summary = ht.group_by( "tp", "fp", TRAIN_COL, LABEL_COL, PREDICTION_COL ).aggregate(n=hl.agg.count()) ht_summary.show(n=20)
def main(args): data_type = "exomes" if args.exomes else "genomes" hl.init(log=f"/ccdg_sample_qc_{data_type}.log") # gcloud compute scp wlu-m:/hard_filter_genomes.log . if args.sample_qc: compute_sample_qc(data_type).write( get_ccdg_results_path(data_type=data_type, result="sample_qc_all"), overwrite=args.overwrite, ) if args.impute_sex: compute_sex(data_type).write( get_ccdg_results_path(data_type=data_type, result="sex"), overwrite=args.overwrite, ) # elif args.reannotate_sex: # reannotate_sex( # args.min_cov, # (args.upper_x, (args.lower_xx, args.upper_xx), args.lower_xxx), # ((args.lower_y, args.upper_y), args.lower_yy), # ).write( # get_ccdg_results_path(data_type=data_type, result="sex"), # overwrite=args.overwrite, # ) ##### Wait for more information # if args.compute_hard_filters: # compute_hard_filters(args.min_cov).write( # hard_filtered_samples.path, overwrite=args.overwrite # ) if args.run_pc_relate or args.reannotate_relatedness: if args.run_pc_relate: logger.warning( "PC-relate requires SSDs and doesn't work with preemptible workers!" ) relatedness_ht = compute_relatedness( data_type, overwrite=args.overwrite, ) else: relatedness_ht = hl.read_table( get_ccdg_results_path(data_type=data_type, result="relatedness") ).checkpoint( "gs://ccdg/tmp/relatedness_ht_checkpoint.ht", overwrite=True ) # Copy HT to temp location to overwrite annotation relatedness_ht = annotate_relatedness( relatedness_ht, first_degree_kin_thresholds=tuple(args.first_degree_kin_thresholds), second_degree_min_kin=args.second_degree_kin_cutoff, ibd0_0_max=args.ibd0_0_max, ) relatedness_ht.write( get_ccdg_results_path(data_type=data_type, result="relatedness"), overwrite=args.overwrite, ) if args.compute_related_samples_to_drop: relatedness_ht = hl.read_table( get_ccdg_results_path(data_type=data_type, result="relatedness") ) related_samples_to_remove = hl.maximal_independent_set( relartedness_ht.i, pairs.j, False ).checkpoint( get_ccdg_results_path(data_type=data_type, result="related_samples"), overwrite=args.overwrite, ) if args.update_variant_filtered_pca_mt: pca_var_ht = hl.read_table(get_pca_variants_path()) mt = hl.vds.to_dense_mt(get_qc_vds(data_type, split=True)) mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key])).checkpoint( get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True), overwrite=args.overwrite, _read_if_exists=(not args.overwrite), ) if args.run_pc_project: ## TODO: Rank samples and hard filter samples mt = hl.read_matrix_table( get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True) ) pca_loadings = hl.read_table(path_to_gnomad_loadings) pca_ht = hl.experimental.pc_project( mt.GT, pca_loadings.loadings, pca_loadings.pca_af, ) pca_ht.checkpoint( get_ccdg_results_path( data_type=data_type, result="gnomad_pc_project_scores" ), overwrite=args.overwrite, ) # related_ht = hl.read_table( # get_ccdg_results_path(data_type=data_type, result="related_samples") # ) # # related_mt = mt.filter_cols(hl.is_defined(related_mt[mt.col_key]), keep=True) # pca_mt = mt.filter_cols(hl.is_defined(related_mt[mt.col_key]), keep=False) # pca_ht = hl.experimental.pc_project( # pca_mt.GT, pca_loadings.loadings, pca_loadings.pca_af # ) # pca_mt = pca_mt.annotate_cols(scores=pca_ht[pca_mt.col_key].scores) # # related_ht = hl.experimental.pc_project( # related_mt.GT, pca_loadings.loadings, pca_loadings.pca_af # ) # related_mt = related_mt.annotate_cols( # scores=related_ht[related_mt.col_key].scores # ) if args.assign_pops: with hl.hadoop_open( path_to_gnomad_rf, "rb", ) as f: 
fit = pickle.load(f) # Reduce the scores to only those used in the RF model, this was 6 for v2 and 16 for v3.1 n_pcs = fit.n_features_ pca_ht = hl.read_table( get_ccdg_results_path( data_type=data_type, result="gnomad_pc_project_scores" ) ) pca_ht = pca_ht.annotate(scores=pca_ht.scores[:n_pcs]) pop_ht, rf_model = assign_population_pcs( pca_ht, pc_cols=pca_ht.scores, fit=fit, ) pop_ht = pop_ht.checkpoint( get_ccdg_results_path(data_type=data_type, result="pop_assignment"), overwrite=args.overwrite, _read_if_exists=not args.overwrite, ) pop_ht.transmute( **{f"PC{i + 1}": pop_ht.pca_scores[i] for i in range(n_pcs)} ).export( get_ccdg_results_path(data_type=data_type, result="pop_assignment")[:-2] + "tsv" ) with hl.hadoop_open( get_ccdg_results_path(data_type=data_type, result="pop_RF_fit")[:-2] + "pickle", "wb", ) as out: pickle.dump(rf_model, out) if args.calculate_inbreeding: qc_mt = hl.read_matrix_table( get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True) ) pop_ht = hl.read_table( get_ccdg_results_path(data_type=data_type, result="pop_assignment"), ) qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop) qc_mt = qc_mt.annotate_rows( call_stats_by_pop=hl.agg.group_by( qc_mt.pop, hl.agg.call_stats(qc_mt.GT, qc_mt.alleles) ) ) inbreeding_ht = ( qc_mt.annotate_cols( inbreeding=hl.agg.inbreeding( qc_mt.GT, qc_mt.call_stats_by_pop[qc_mt.pop].AF[1] ) ) .cols() .select("inbreeding") ) inbreeding_ht.write( get_ccdg_results_path(data_type=data_type, result="inbreeding"), overwrite=args.overwrite, ) if args.apply_stratified_filters or args.apply_regressed_filters: filtering_qc_metrics = args.filtering_qc_metrics.split(",") sample_qc_ht = hl.read_table( get_ccdg_results_path(data_type=data_type, result="sample_qc_bi_allelic") ) pc_scores = hl.read_table( get_ccdg_results_path(data_type=data_type, result="pc_scores") ) sample_qc_ht = sample_qc_ht.select( scores=pc_scores[sample_qc_ht.key]["scores"], ) pop_ht = hl.read_table( get_ccdg_results_path(data_type=data_type, result="pop_assignment"), ) if "inbreeding" in filtering_qc_metrics: inbreeding_ht = hl.read_table( get_ccdg_results_path(data_type=data_type, result="inbreeding") )[sample_qc_ht.key] sample_qc_ht = sample_qc_ht.annotate( inbreeding=inbreeding_ht.inbreeding.f_stat ) if args.apply_regressed_filters: n_pcs = args.regress_n_pcs residuals_ht = compute_qc_metrics_residuals( ht=sample_qc_ht, pc_scores=sample_qc_ht.scores[:n_pcs], qc_metrics={ metric: sample_qc_ht[metric] for metric in filtering_qc_metrics }, ) residuals_ht = residuals_ht.filter( hl.is_missing(hard_filtered_samples.ht()[residuals_ht.key]) ) stratified_metrics_ht = compute_stratified_metrics_filter( ht=residuals_ht, qc_metrics=dict(residuals_ht.row_value), metric_threshold={ "n_singleton_residual": (math.inf, 8.0), "r_het_hom_var_residual": (math.inf, 4.0), }, ) residuals_ht = residuals_ht.annotate( **stratified_metrics_ht[residuals_ht.key] ) residuals_ht = residuals_ht.annotate_globals( **stratified_metrics_ht.index_globals(), n_pcs=n_pcs, ) else: logger.info( "Computing stratified QC metrics filters using metrics: " + ", ".join(filtering_qc_metrics) ) sample_qc_ht = sample_qc_ht.annotate(qc_pop=pop_ht[sample_qc_ht.key].pop) # TODO: compute hard-filtered samples sample_qc_ht = sample_qc_ht.filter( hl.is_missing(hard_filtered_samples.ht()[sample_qc_ht.key]) ) stratified_metrics_ht = compute_stratified_metrics_filter( sample_qc_ht, qc_metrics={ metric: sample_qc_ht[metric] for metric in filtering_qc_metrics }, strata={"qc_pop": sample_qc_ht.qc_pop}, 
metric_threshold={"n_singleton": (4.0, 8.0)}, )
def tabix(b, ss_path, out_dir):
    r'''
    tabix's a bgz file with gcloud path `path` using Batch `b`
    '''
    fname = ss_path.split('/')[-1]
    f = b.read_input(ss_path)
    j = b.new_job(name=fname.split('.')[0])
    j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')  # treat header (which begins with "chr") as a comment
    j.command(f'mv {f}.tbi {j.ofile}')
    b.write_output(j.ofile, f'{out_dir}/{fname}.tbi')


if __name__ == "__main__":
    hl.init(log='/Users/nbaya/Downloads/tabix_sumstats.log')

    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(
        name='tabix',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
        default_storage='100M',  # works with 2G
        default_cpu=1)

    # sumstats_dir = f'{bucket}/sumstats_flat_files'
    # sumstats_dir = f'{ldprune_dir}/export_results/update'
    # sumstats_dir = f'{ldprune_dir}/loo/sumstats/batch1'
    sumstats_dir = f'{ldprune_dir}/variant_qc'
    print(f'\nUsing sumstats from {sumstats_dir}')
from hail.expr.expressions import *
from hail.expr.expressions import Expression
from hail.typecheck import *
from hail import Table
import hail

from google.cloud import storage
storage.Client()
client = storage.Client()

import gcsfs
fs = gcsfs.GCSFileSystem(project='your-project')
bucket = client.get_bucket('your-bucket')

import hail as hl
import hail.expr.aggregators as agg
hl.init()

# read mt file
mt = hl.read_matrix_table(
    "gs://1k_genome/1000-genomes/VDS-of-all/ALL.chr.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.mt"
)
# print(mt.count())  (39706715, 1092)

# filter MAF
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
# print(mt.count())  (13404583, 1092)

# filter only SNPs
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
# print(mt.count())  (12194564, 1092)
def main(): parser = argparse.ArgumentParser() # reference args parser.add_argument( '--ref-dirname', default= 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/' ) parser.add_argument('--ref-basename', default='unrelated') parser.add_argument( '--ref-info', default= 'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv' ) parser.add_argument('--reference', type=str, default='GRCh38') parser.add_argument('--pca-type', type=str, default='normal', choices=['normal', 'project', 'joint']) # data args parser.add_argument('--data-dirname', type=str, required=True) parser.add_argument('--data-basename', type=str, required=True) parser.add_argument('--input-type', type=str, required=True, choices=['vcf', 'plink', 'hail']) # filter args parser.add_argument('--maf', type=float, default=0.05, help='include only SNPs with MAF >= NUM in PCA') parser.add_argument('--hwe', type=float, default=1e-3, help='include only SNPs with HWE >= NUM in PCA') parser.add_argument('--geno', type=float, default=0.98, help='include only SNPs with call-rate > NUM') parser.add_argument( '--ld-cor', type=float, default=0.2, choices=range(0, 1), metavar="[0.0-1.0]", help= 'Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]' ) parser.add_argument( '--ld-window', type=int, default=250000, help='Window size in base pairs (inclusive upper bound)') parser.add_argument('--npcs', type=int, default=20, help='Number of PCs to use') parser.add_argument('--relatedness-method', type=str, default='pc_relate', choices=['pc_relate', 'ibd', 'king'], help='Method to use for the inference of relatedness') parser.add_argument('--relatedness-thresh', type=float, default=0.98, help='Threshold value to use in relatedness checks') parser.add_argument( '--prob', type=float, default=0.8, help= 'Minimum probability of belonging to a given population for the population to be set' ) parser.add_argument('--out-dir', type=str, required=True) args = parser.parse_args() if not args.prob: print(f'No prob value specified, {args.prob} will be used') hl.init(default_reference=args.reference) pca(ref_dirname=args.ref_dirname, ref_basename=args.ref_basename, ref_info=args.ref_info, reference=args.reference, pca_type=args.pca_type, input_type=args.input_type, data_dirname=args.data_dirname, data_basename=args.data_basename, maf=args.maf, hwe=args.hwe, call_rate=args.geno, ld_cor=args.ld_cor, ld_window=args.ld_window, n_pcs=args.npcs, relatedness_method=args.relatedness_method, relatedness_thresh=args.relatedness_thresh, prob_threshold=args.prob, out_dir=args.out_dir) print('\nDone running PCA')
    )
    ht = ht.annotate(
        validated_denovo_inheritance=ht_val_filtered[ht.key].inheritance)

    ht.write(
        f'{lustre_dir}/variant_qc/models/{run_hash}_rf_result_FINAL_for_RANKING_100_trios.ht',
        overwrite=True)


if __name__ == "__main__":
    # need to create spark cluster first before initialising hail
    sc = pyspark.SparkContext()

    # Define the hail persistent storage directory
    hl.init(sc=sc, tmp_dir=lustre_dir, local_tmpdir=lustre_dir,
            default_reference="GRCh38")

    # s3 credentials required for user to access the datasets in farm flexible compute s3 environment
    # you may use your own here from your .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()
    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    ################################
    ################################

    main()
import hail as hl
import sys
import timeit

start = timeit.default_timer()

chrom = str(sys.argv[1])

hl.init(log='/hail.log', min_block_size=2048, default_reference='GRCh38')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# define files
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# input
vds_ldpruned_common_file = 'gs://ccdg-qc-multi/vds/qced/' + chrom + '/ldpruned_common.vds'
vds_1kg_file = 'gs://ccdg-qc-multi/data/1000genomes/vds/hail2_ALL.GRCh38.genotypes.20170504.vds'
mhc_chr8inv_file = 'gs://ccdg-qc-multi/data/MHC_invchr8_longLDreg_liftover_to_GRCh38.txt'
rel_exclusion_file = 'gs://ccdg-qc-multi/out/king/' + chrom + '/ibd_greater_0884_' + chrom + '.txt'
samples_to_keep_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/01_sample_qc_keep.txt'

# output
pca_value_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_values.tsv'
pca_score_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_scores.tsv'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## interval list
#mhc_chr8inv = hl.import_table(mhc_chr8inv_file, no_header=True).key_by('f0')
    sys.exit(1)
else:
    filter_constraint = True

if not os.path.isfile(args.vep_config):
    logger.error(f"Could not find vep config file {args.vep_config}")
    sys.exit(1)

# Prepare output path
if not os.path.exists(os.path.abspath(os.path.dirname(args.output))):
    os.makedirs(os.path.abspath(os.path.dirname(args.output)))

# Set hail temporary path
hl.init(
    idempotent=True,
    tmp_dir=args.tmp_dir,
    log=os.path.join(args.tmp_dir, 'hail.log'),
)

##
# Main script
#

logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

logger.info(f"Importing vcf file {args.vcf}")
data = hl.import_vcf(args.vcf,
                     call_fields=['GT'],
                     skip_invalid_loci=True,
                     force_bgz=True)
data = hl.split_multi_hts(data)
""" Annotate variant HailTable with allelic frequencies from different (external) sources (e.g., gnomad exomes and genomes) """ import hail as hl from utils.data_utils import (get_gnomad_genomes_v3_af_ht, get_bonn_af_ht, get_germ_af_ht, get_rum_af_ht, get_vep_annotation_ht) from utils.generic import current_date hl.init(default_reference='GRCh38') nfs_dir = 'file:///home/ubuntu/data' nfs_tmp = 'file:///home/ubuntu/data/tmp' hdfs_dir = 'hdfs://spark-master:9820/dir/hail_data' ## import variant table variant_ht = get_vep_annotation_ht() ## import af tables # In-hause german allelic frequencies (Tuebingen) ht_ger_af = get_germ_af_ht() # In-hause german allelic frequencies (Bonn) bonn_af = get_bonn_af_ht()
def main(args): hl.init() data_type = "genomes" if args.genomes else "exomes" if not args.skip_write_qc_mt: logger.info("Importing data...") # 1h40 for exomes, 3h20 for genomes mt = get_gnomad_data( data_type, raw=True, split=False ) # NOTE: using full calls since hardcalls doesn't exist at this stage logger.info( "Filtering to bi-allelic, high-callrate, common SNPs for sample QC..." ) mt = mt.filter_rows((hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > 0.001) & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)) mt.annotate_cols(callrate=hl.agg.fraction(hl.is_defined( mt.GT))).naive_coalesce(5000).write(qc_mt_path(data_type), overwrite=args.overwrite) qc_mt = hl.read_matrix_table(qc_mt_path(data_type)) logger.info("Importing metadata...") meta_ht = hl.import_table(qc_meta_path(data_type), impute=True, types={ 'age': hl.tfloat64 }).key_by('s') qc_mt = qc_mt.annotate_cols(**meta_ht[qc_mt.s]) logger.info("Inferring sex...") qc_ht = annotate_sex(qc_mt, qc_temp_data_prefix(data_type), male_threshold=0.8 if args.genomes else 0.6).cols() # Flag Klinefelter's individuals and samples with sex aneuploidies if args.exomes: qc_ht = qc_ht.annotate( ambiguous_sex=((qc_ht.f_stat >= 0.5) & (hl.is_defined(qc_ht.normalized_y_coverage) & (qc_ht.normalized_y_coverage <= 0.1))) | (hl.is_missing(qc_ht.f_stat)) | ((qc_ht.f_stat >= 0.4) & (qc_ht.f_stat <= 0.6) & (hl.is_defined(qc_ht.normalized_y_coverage) & (qc_ht.normalized_y_coverage > 0.1))), sex_aneuploidy=(qc_ht.f_stat < 0.4) & hl.is_defined(qc_ht.normalized_y_coverage) & (qc_ht.normalized_y_coverage > 0.1)) else: qc_ht = qc_ht.annotate(ambiguous_sex=hl.is_missing(qc_ht.is_female)) logger.info("Annotating samples failing hard filters...") if args.exomes: sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when( qc_ht.sex_aneuploidy, "sex_aneuploidy").when(qc_ht.is_female, "female").default("male")) else: sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when( qc_ht.is_female, "female").default("male")) qc_ht = qc_ht.annotate( hard_filters=make_hard_filters_expr(qc_ht, data_type), perm_filters=make_perm_filters_expr(qc_ht, data_type), sex=sex_expr, data_type=data_type).key_by('data_type', 's') qc_ht.write(qc_ht_path(data_type, 'hard_filters'), overwrite=args.overwrite) # Export annotations to make rank list for relatedness (in final sample QC) if args.exomes: colnames = ['internal', 'project_id', 'pct_bases_20x', 'perm_filters'] else: colnames = ['pcr_free', 'mean_dp', 'perm_filters'] rank_ht = qc_ht.filter(hl.len(qc_ht.hard_filters) == 0, keep=True).select(*colnames) (rank_ht.annotate(releasable=( hl.len(rank_ht.perm_filters) == 0)).drop('perm_filters').export( rank_annotations_path(data_type))) # Check numbers: qc_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters')) sample_count = qc_ht.count() checkpoint1a = qc_ht.aggregate( hl.agg.count_where(hl.len(qc_ht['hard_filters']) == 0)) checkpoint1b = qc_ht.aggregate( hl.agg.count_where((hl.len(qc_ht['hard_filters']) == 0) & (hl.len(qc_ht.perm_filters) == 0))) logger.info('{} samples found before filtering'.format(sample_count)) logger.info('{} samples found after checkpoint 1a (hard filters)'.format( checkpoint1a)) logger.info( '{} samples found after checkpoint 1b (hard filters + permissions)'. format(checkpoint1b))