def create_full_results_file(prune, overwrite=False):
    r'''
    Concatenates PRS-phenotype regression results into a single table.
    '''
    reg_path_regex = prs_dir + f'prs_phen_reg.*.*.n_remove_{int(n_remove_per_sex)}.seed_*.{"" if prune else "not_"}pruned*.tsv'
    ls = hl.hadoop_ls(reg_path_regex)
    reg_paths = sorted([f['path'] for f in ls])
    df_list = []
    for reg_path in reg_paths:
        with hl.hadoop_open(reg_path) as f:
            df_list.append(pd.read_csv(f, sep='\t'))
    df = pd.concat(df_list, sort=False)
    df.insert(1, 'phen_desc',
              df.phen.astype(str).apply(lambda x: phen_dict[x][0]))  # add phenotype description to dataframe
    all_reg_results_path = prs_dir + f'prs_phen_reg.all_phens.n_remove_{int(n_remove_per_sex)}.{"" if prune else "not_"}pruned.tsv'
    if hl.hadoop_is_file(all_reg_results_path) and not overwrite:
        print('\n... Full PRS-phen regression results already written! ...')
        print(all_reg_results_path)
    else:
        print('\n... Writing PRS-phen regression results ...')
        print(all_reg_results_path)
        with hl.hadoop_open(all_reg_results_path, 'w') as f:
            df.to_csv(f, sep='\t', index=False)
def run_assign_population_pcs(
        pop_pc_table: hl.Table,
        outfile: str,
        picklefile: str,
        pcs: List[int],
        fit: RandomForestClassifier = None,
        seed: int = 42) -> Tuple[hl.Table, RandomForestClassifier]:
    """
    :param Table pop_pc_table: Table containing population PCs ('PC<n>') as well as a column 'known_pop' with population labels
    :param str outfile: filepath to tsv with input samples and imputed population labels
    :param str picklefile: filepath to which the pickled random forest model is written
    :param list of int pcs: 1-based list of PCs to train the model on
    :param RandomForestClassifier fit: fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
    :param int seed: Random seed
    :return: Table containing sample IDs and imputed population labels, trained random forest model
    :rtype: Table, RandomForestClassifier
    """
    data = pop_pc_table.to_pandas()
    data = expand_pd_array_col(data, 'scores', max(pcs), 'PC')
    new_data, pop_clf = assign_population_pcs(
        data, pc_cols=['PC{}'.format(pc) for pc in pcs], fit=fit, seed=seed)

    if not fit:
        # Pickle RF
        with hl.hadoop_open(picklefile, 'wb') as out:
            pickle.dump(pop_clf, out)

    with hl.hadoop_open(outfile, 'w') as out:
        new_data.to_csv(out, sep="\t", na_rep="NA", index=False)
    return hl.import_table(outfile, impute=True).key_by('data_type', 's'), pop_clf
def write(self, path, overwrite):  # pylint: disable=unused-argument
    with hl.hadoop_open(self.path, "r") as input_file:
        with hl.hadoop_open(path, "w") as output_file:
            reader = csv.reader(input_file, delimiter="\t")
            writer = csv.writer(output_file, delimiter="\t", quotechar="'")
            for row in reader:
                writer.writerow(row)
def write_functional_pedigree(input_pedigree: str, vcf_samples: list,
                              output_dir: str, output_name: str) -> Tuple[dict, dict, dict]:
    """
    Write a functional pedigree (pedigree with samples not in the VCF removed) and create dictionaries of seqr projects, family IDs, and given sex.

    :param input_pedigree: Path to input pedigree
    :param vcf_samples: Samples found in the VCF
    :param output_dir: Directory to which the functional pedigree is written
    :param output_name: Prefix for the output pedigree file
    :return: Dictionaries of seqr project ID, family ID, and given sex for each sample
    """
    seqr_projects = defaultdict(str)
    family_ids = defaultdict(str)
    given_sex = defaultdict(str)

    out_new_ped = hl.hadoop_open(
        f"{output_dir}/{output_name}_functioning_pedigree.ped", "w")
    out_new_ped.write("Family_ID\tIndividual_ID\tPaternal_ID\tMaternal_ID\tSex\n")

    with hl.hadoop_open(input_pedigree, "r") as infile:
        next(infile)
        for line in infile:
            line = line.rstrip("\n")
            items = line.split("\t")
            (
                Project_GUID,
                Family_ID,
                Individual_ID,
                Paternal_ID,
                Maternal_ID,
                Sex,
            ) = items[0:6]

            if Individual_ID not in vcf_samples:
                Individual_ID = "."
            if Paternal_ID not in vcf_samples:
                Paternal_ID = "."
            if Maternal_ID not in vcf_samples:
                Maternal_ID = "."

            # Only output line from pedigree if the proband is not missing
            if Individual_ID != ".":
                seqr_projects[Individual_ID] = Project_GUID
                family_ids[Individual_ID] = Family_ID
                given_sex[Individual_ID] = Sex
                out_new_ped.write(
                    f"{Family_ID}\t{Individual_ID}\t{Paternal_ID}\t{Maternal_ID}\t{Sex}\n"
                )

    out_new_ped.close()
    return seqr_projects, family_ids, given_sex
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(FILTERED_VARIANTS)
    nrows = mt.count_rows()
    print(f'mt.count_rows() = {nrows}')

    # Plot the allele frequency
    fig = figure(
        title='Variant AF',
        x_axis_label='Allele Frequency',
        y_axis_label='Frequency (%)',
    )
    variant_af = mt.variant_qc.AF[1].collect()
    af_count, edges = np.histogram(variant_af,
                                   bins=100,
                                   weights=np.ones(len(variant_af)) / len(variant_af))
    variant_af_count = pd.DataFrame({
        'variant_af_count': af_count,
        'left': edges[:-1],
        'right': edges[1:]
    })
    fig.quad(
        bottom=0,
        top=variant_af_count['variant_af_count'],
        left=variant_af_count['left'],
        right=variant_af_count['right'],
        fill_color='blue',
        line_color='black',
    )

    # Add in the cumulative distribution
    cumulative_af = np.cumsum(af_count)
    fig.line(
        x=variant_af_count['right'],
        y=cumulative_af,
        color='gray',
        line_width=1,
        legend='Cum dist',
    )
    fig.legend.location = 'top_left'

    fig_filename = output_path('variant_selection_histogram.png', 'web')
    with hl.hadoop_open(fig_filename, 'wb') as f:
        get_screenshot_as_png(fig).save(f, format='PNG')

    html = file_html(fig, CDN, 'my plot')
    fig_filename_html = output_path('variant_selection_histogram.html', 'web')
    with hl.hadoop_open(fig_filename_html, 'w') as f:
        f.write(html)
def combine_gvcfs(
        gvcf_input_list: str,
        out_path: str = 'gs://african-seq-data/gambian-genomes/COMBINED_GVCFS/',
        samples: str = 'gs://african-seq-data/gambian-genomes/sample_names.txt',
        gvcf_header: str = 'gs://african-seq-data/gambian-genomes/merged-gvcf/SC_GMJOL5309875.alt_bwamem_GRCh38DH.20151208.JOLA.gambian_lowcov/SC_GMJOL5309875.alt_bwamem_GRCh38DH.20151208.JOLA.gambian_lowcov.g.vcf.gz',
        out_mt_name: str = 'gambian_genomes_merged_gvcfs',
        temp_bucket: str = 'gs://african-seq-data',
        reference: str = 'GRCh38',
        use_genome_default_intervals: bool = True,
        overwrite: bool = True,
        key_by_locus_and_alleles: bool = True):
    """
    Combine single-sample GVCFs into a multi-sample matrix table

    :param gvcf_input_list: text file with paths to the GVCFs to be combined. One path per line
    :param out_path: path to where multi-sample MT will be written
    :param samples: text file with sample names as they appear in each GVCF to be merged. One sample name per line
    :param gvcf_header: GVCF file whose header is going to be used as default
    :param out_mt_name: name to use for output MT
    :param temp_bucket: bucket for storing intermediate files
    :param reference: reference genome to use
    :param use_genome_default_intervals: import GVCFs with uniform partition intervals of default size for whole-genome data
    :param overwrite: overwrite MT if it exists
    :param key_by_locus_and_alleles: key by both locus and alleles in the final output
    """
    inputs = []
    with hl.hadoop_open(gvcf_input_list, 'r') as f:
        for line in f:
            inputs.append(line.strip())

    samples_list = []
    with hl.hadoop_open(samples, 'r') as f:
        for line in f:
            samples_list.append(line.strip())

    output_file = f'{out_path}{out_mt_name}.mt'  # output destination

    hl.experimental.run_combiner(
        inputs,
        out_file=output_file,
        tmp_path=temp_bucket,
        header=gvcf_header,
        sample_names=samples_list,
        reference_genome=reference,
        use_genome_default_intervals=use_genome_default_intervals,
        overwrite=overwrite,
        key_by_locus_and_alleles=key_by_locus_and_alleles)
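# A minimal usage sketch of combine_gvcfs (assumption: the GVCF list path below
# is hypothetical). All other arguments fall back to the defaults declared above,
# which point at the gambian-genomes buckets, so in practice out_path, samples,
# and temp_bucket would usually be overridden as well.
combine_gvcfs(gvcf_input_list='gs://my-bucket/gvcf_paths.txt')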
def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1],
                                   filter=r'\[Controls\]',
                                   skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(
                chip_pos.chr))
        chip_pos = chip_pos.key_by(
            locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        # liftover chip position info
        chip_pos = chip_pos.annotate(
            new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(
            chip_pos[full_vcf.locus]))
        hl.export_vcf(
            geno_vcf,
            'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
def run_population_pca(mt: hl.MatrixTable, build: int, num_pcs=6) -> hl.Table:
    """
    Projects samples onto pre-computed gnomAD and rare disease sample principal components using PCA loadings.
    A random forest classifier assigns gnomAD and rare disease sample population labels.

    :param mt: QC MatrixTable
    :param build: 37 or 38 for write path
    :param num_pcs: Number of PCs to use in model
    :return: Table annotated with assigned RDG and gnomAD population and PCs
    :rtype: Table
    """
    loadings = hl.read_table(rdg_gnomad_pop_pca_loadings_path(build))
    model_path = rdg_gnomad_rf_model_path()
    mt = mt.select_entries("GT")
    scores = pc_project(mt, loadings)
    scores = scores.annotate(
        scores=scores.scores[:num_pcs], known_pop="Unknown"
    ).key_by("s")

    logger.info("Unpacking RF model")
    fit = None
    with hl.hadoop_open(model_path, "rb") as f:
        fit = pickle.load(f)

    pop_pca_ht, ignore = assign_population_pcs(
        scores, pc_cols=scores.scores, output_col="qc_pop", fit=fit
    )
    pop_pca_ht = pop_pca_ht.key_by("s")
    pop_pcs = {f"pop_PC{i+1}": scores.scores[i] for i in range(num_pcs)}
    scores = scores.annotate(**pop_pcs).drop("scores", "known_pop")
    pop_pca_ht = pop_pca_ht.annotate(**scores[pop_pca_ht.key])
    return pop_pca_ht
def read_sample_ids(sample_ids_path, start_with_sample_i, n_samples_to_process, n_sample_ids_to_print=10):
    """Read sample ids file.

    Args:
        sample_ids_path (str): sample ids path
        start_with_sample_i (int): 0-based index of the first sample id to read
        n_samples_to_process (int): number of sample ids to read, starting from start_with_sample_i
        n_sample_ids_to_print (int): log no more than this many sample ids to stdout.

    Return:
        list: sample id strings
    """
    sample_ids = []
    with hl.hadoop_open(sample_ids_path) if sample_ids_path.startswith(
            "gs://") else open(sample_ids_path, "rt") as f:
        for i, line in enumerate(f):
            if i < start_with_sample_i:
                continue
            elif i >= start_with_sample_i + n_samples_to_process:
                break
            sample_id = line.rstrip("\n")
            sample_ids.append(sample_id)

            if i <= n_sample_ids_to_print:
                logging.info(sample_id)
            if i == n_sample_ids_to_print and n_sample_ids_to_print > 0:
                logging.info("...")

    logging.info(f"Parsed {len(sample_ids)} sample ids from {sample_ids_path}")
    return sample_ids
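# A minimal usage sketch of read_sample_ids (assumption: the sample-id file
# path below is hypothetical; the file is expected to contain one sample id
# per line). Reads ids 0-99 and logs the first few to stdout.
sample_ids = read_sample_ids("gs://my-bucket/sample_ids.txt",
                             start_with_sample_i=0,
                             n_samples_to_process=100)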
def main(df_x_path, df_y_path, output_path, python_image):
    # pass the service backend to the batch so jobs run on it (and so closing
    # it below has something to close)
    backend = hb.ServiceBackend()
    b = hb.Batch(name='rf-loo', backend=backend,
                 default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)
    results = []

    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue

        j = b.new_python_job()
        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()

        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)
    b.run(wait=False)
    backend.close()
def main(args): print("main") ht = hl.read_table( f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht' ) run_hash = str(uuid.uuid4())[:8] rf_runs = get_rf_runs(f'{tmp_dir}/ddd-elgh-ukbb/') while run_hash in rf_runs: run_hash = str(uuid.uuid4())[:8] ht_result, rf_model = train_rf(ht, args) print("Writing out ht_training data") ht_result = ht_result.checkpoint( f'{tmp_dir}/ddd-elgh-ukbb/Sanger_RF_training_data.ht', overwrite=True) rf_runs[run_hash] = get_run_data( vqsr_training=False, transmitted_singletons=True, test_intervals=args.test_intervals, adj=False, features_importance=hl.eval(ht_result.features_importance), test_results=hl.eval(ht_result.test_results), ) with hl.hadoop_open(f'{plot_dir}/ddd-elgh-ukbb/variant_qc/rf_runs.json', "w") as f: json.dump(rf_runs, f) logger.info("Saving RF model") save_model(rf_model, f'{tmp_dir}/ddd-elgh-ukbb/rf_model.model')
def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(HGDP1KG_TOBWGS) # Perform kinship test with pc_relate pc_rel_path = output_path('pc_relate_kinship_estimate.ht') pc_rel = hl.pc_relate(mt.GT, 0.01, k=10, statistics='kin') pc_rel.write(pc_rel_path, overwrite=True) pairs = pc_rel.filter(pc_rel['kin'] >= 0.125) related_samples_to_remove = hl.maximal_independent_set( pairs.i, pairs.j, False) n_related_samples = related_samples_to_remove.count() print(f'related_samples_to_remove.count() = {n_related_samples}') # save as html html = pd.DataFrame({ 'removed_individual': related_samples_to_remove.node.s.collect() }).to_html() plot_filename_html = output_path(f'removed_samples.html', 'web') with hl.hadoop_open(plot_filename_html, 'w') as f: f.write(html)
def print_ref_block_stats(path: str):
    import numpy as np

    def _print_block_stats(stats: hl.Struct):
        def get_quantile(cum_prop, quantile):
            return [i for i, x in enumerate(cum_prop) if x > quantile][0]

        for strat, ref_block_stats in [('all', stats.ref_block_stats),
                                       ('adj', stats.adj_ref_block_stats)]:
            n_blocks = np.sum(
                ref_block_stats.hist.bin_freq) + ref_block_stats.hist.n_larger
            cum_blocks = np.cumsum(ref_block_stats.hist.bin_freq)
            cum_prop = [x / n_blocks for x in cum_blocks]

            print(f"Stats for {strat}")
            print(f"Number of blocks: {n_blocks}")
            print(f"Largest block size: {ref_block_stats.stats.max}")
            print(f"95% block size: {get_quantile(cum_prop, 0.95)}")
            print(f"99% block size: {get_quantile(cum_prop, 0.99)}")
            print(f"99.9% block size: {get_quantile(cum_prop, 0.999)}")
            print(f"99.95% block size: {get_quantile(cum_prop, 0.9995)}")
            print(
                f"Percentage blocks below 10k: {1-(ref_block_stats.hist.n_larger/n_blocks)}"
            )

    if path.startswith('gs://'):
        with hl.hadoop_open(path, 'rb') as f:
            _print_block_stats(pickle.load(f))
    else:
        with open(path, 'rb') as f:
            _print_block_stats(pickle.load(f))
def get_rows_data(rows_files):
    file_sizes = []
    partition_bounds = []
    parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
            if i < len(parts) - 1:
                test_index = parts[i + 1]['path'].split(
                    f'{parts_file[0]}/part-')[1].split('-')[0]
                if test_index == index:
                    continue
            file_sizes.append(x['size_bytes'])
    metadata_file = [
        x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], 'rb') as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(x['start']['locus']['contig'],
                                     x['start']['locus']['position'],
                                     x['end']['locus']['contig'],
                                     x['end']['locus']['position'])
                                    for x in rows_meta['jRangeBounds']]
            except KeyError:
                pass
    return partition_bounds, file_sizes
def test_export(self):
    t = hl.utils.range_table(1).annotate(foo=3)
    tmp_file = new_temp_file()
    t.export(tmp_file)
    with hl.hadoop_open(tmp_file, 'r') as f_in:
        assert f_in.read() == 'idx\tfoo\n0\t3\n'
def test_export_delim(self):
    t = hl.utils.range_table(1).annotate(foo=3)
    tmp_file = new_temp_file()
    t.export(tmp_file, delimiter=',')
    with hl.hadoop_open(tmp_file, 'r') as f_in:
        assert f_in.read() == 'idx,foo\n0,3\n'
def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(HGDP1KG_TOBWGS) mt = mt.filter_cols( (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe') | (mt.s.contains('TOB')) ) # Remove related samples (at the 2nd degree or closer) king = hl.king(mt.GT) king_path = output_path('king_kinship_estimate_NFE.ht') king.write(king_path) ht = king.entries() related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True) struct = hl.struct(i=related_samples.s_1, j=related_samples.s) struct = struct.annotate(phi=related_samples.phi) related_samples_to_remove = hl.maximal_independent_set( struct.i, struct.j, False # pylint: disable=E1101 ) n_related_samples = related_samples_to_remove.count() print(f'related_samples_to_remove.count() = {n_related_samples}') # save as html html = pd.DataFrame( {'related_individual': related_samples_to_remove.node.collect()} ).to_html() plot_filename_html = output_path(f'related_samples.html', 'web') with hl.hadoop_open(plot_filename_html, 'w') as f: f.write(html)
def get_rows_data(rows_files):  # noqa: D103
    file_sizes = []
    partition_bounds = []
    parts_file = [x["path"] for x in rows_files if x["path"].endswith("parts")]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x["path"].split(f"{parts_file[0]}/part-")[1].split("-")[0]
            if i < len(parts) - 1:
                test_index = (parts[i + 1]["path"].split(
                    f"{parts_file[0]}/part-")[1].split("-")[0])
                if test_index == index:
                    continue
            file_sizes.append(x["size_bytes"])
    metadata_file = [
        x["path"] for x in rows_files if x["path"].endswith("metadata.json.gz")
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], "rb") as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(
                    x["start"]["locus"]["contig"],
                    x["start"]["locus"]["position"],
                    x["end"]["locus"]["contig"],
                    x["end"]["locus"]["position"],
                ) for x in rows_meta["jRangeBounds"]]
            except KeyError:
                pass
    return partition_bounds, file_sizes
def get_cases_and_controls_from_log(log_format):
    """
    Parse the number of cases and controls from per-chromosome variant log files.

    Example `log_format`: 'gs://path/to/result_chr{chrom}_000000001.variant.log'
    """
    cases = controls = -1
    for chrom in range(10, 23):
        try:
            with hl.hadoop_open(log_format.format(chrom=chrom)) as f:
                for line in f:
                    line = line.strip()
                    if line.startswith('Analyzing'):
                        fields = line.split()
                        if len(fields) == 6:
                            try:
                                cases = int(fields[1])
                                controls = int(fields[4])
                                break
                            except ValueError:
                                logger.warning(
                                    f'Could not load number of cases or controls from {line}.'
                                )
                    elif line.endswith('samples were used in fitting the NULL glmm model and are found in sample file') or \
                            line.endswith('samples have been used to fit the glmm null model'):
                        # This is ahead of the case/control count line ("Analyzing ...") above, so this should be ok
                        fields = line.split()
                        try:
                            cases = int(fields[0])
                        except ValueError:
                            logger.warning(
                                f'Could not load number of cases or controls from {line}.'
                            )
            return cases, controls
        except:
            pass
    return cases, controls
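# A minimal usage sketch (assumption: the log path pattern below is hypothetical;
# it must contain a '{chrom}' placeholder as in the docstring example).
cases, controls = get_cases_and_controls_from_log(
    'gs://path/to/result_chr{chrom}_000000001.variant.log')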
def infer_ped(related_data: GnomADRelatedData) -> None:
    """
    Infers trios based on `pc_relate` kinship output.
    Writes a TSV containing one row per trio.
    If there are duplicate samples, each combination of duplicate samples will be present in the output.

    :param GnomADRelatedData related_data: Input data for inference
    :return: Nothing
    :rtype: None
    """
    logger.info(f"Inferring pedigree for {related_data.data_type}")
    sex = {row.s: row.is_female for row in related_data.meta_pd.itertuples()}
    dups_to_remove = {s for d in related_data.dups for s in list(d)[1:]}

    raw_ped = infer_families(related_data.kin_ht, sex, dups_to_remove)
    logger.info(
        f"Found {len(raw_ped.complete_trios())} complete trios in {related_data.data_type}."
    )

    # Create dataframe with all combinations of dups
    dup_trios_pd = get_dup_trios(raw_ped, related_data.sample_to_dups)
    logger.info(
        f"Found {len(dup_trios_pd)} trio combinations with dups in {related_data.data_type}."
    )
    with hl.hadoop_open(dup_pedigree_tsv_path(related_data.data_type), 'w') as out:
        dup_trios_pd.to_csv(out, sep="\t", index=False)
def apply_mito_artifact_filter(
    mt: hl.MatrixTable,
    artifact_prone_sites_path: str,
) -> hl.MatrixTable:
    """Add back in artifact_prone_site filter

    :param hl.MatrixTable mt: MatrixTable to use as input
    :param str artifact_prone_sites_path: path to BED file of artifact_prone_sites to flag in the filters column
    :return: MatrixTable with artifact_prone_sites filter
    :rtype: hl.MatrixTable
    """
    # apply "artifact_prone_site" filter to any SNP or deletion that spans a known problematic site
    mt = mt.annotate_rows(
        position_range=hl.range(mt.locus.position,
                                mt.locus.position + hl.len(mt.alleles[0])))

    artifact_sites = []
    with hl.hadoop_open(artifact_prone_sites_path) as f:
        for line in f:
            pos = line.split()[2]
            artifact_sites.append(int(pos))
    sites = hl.literal(set(artifact_sites))

    mt = mt.annotate_rows(filters=hl.if_else(
        hl.len(hl.set(mt.position_range).intersection(sites)) > 0,
        {"artifact_prone_site"},
        {"PASS"},
    ))

    mt = mt.drop("position_range")

    return mt
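# Usage sketch (assumption: the BED path below is hypothetical). Note that the
# loop above reads the third whitespace-delimited column of each BED line,
# which for a single-base BED interval corresponds to the 1-based position of
# the artifact-prone site.
mt = apply_mito_artifact_filter(mt, "gs://my-bucket/artifact_prone_sites.bed")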
def load_id_file(path):
    ids = []
    with hl.hadoop_open(path) as f:
        for l in f:
            r = l.strip().split('\t')
            self.assertEqual(len(r), 2)
            ids.append(r[1])
    return ids
def main(args): print("main") ht = hl.read_table( f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht' ) if args.train_rf: run_hash = str(uuid.uuid4())[:8] rf_runs = get_rf_runs(f'{tmp_dir}/ddd-elgh-ukbb/') while run_hash in rf_runs: run_hash = str(uuid.uuid4())[:8] ht_result, rf_model = train_rf(ht, args) print("Writing out ht_training data") ht_result = ht_result.checkpoint(get_rf(data="training", run_hash=run_hash).path, overwrite=True) # f'{tmp_dir}/ddd-elgh-ukbb/Sanger_RF_training_data.ht', overwrite=True) rf_runs[run_hash] = get_run_data( vqsr_training=False, transmitted_singletons=True, test_intervals=args.test_intervals, adj=False, features_importance=hl.eval(ht_result.features_importance), test_results=hl.eval(ht_result.test_results), ) with hl.hadoop_open( f'{plot_dir}/ddd-elgh-ukbb/variant_qc/rf_runs.json', "w") as f: json.dump(rf_runs, f) logger.info("Saving RF model") save_model(rf_model, get_rf(data="model", run_hash=run_hash), overwrite=True) # f'{tmp_dir}/ddd-elgh-ukbb/rf_model.model') else: run_hash = args.run_hash if args.apply_rf: logger.info(f"Applying RF model {run_hash}...") rf_model = load_model(get_rf(data="model", run_hash=run_hash)) ht = get_rf(data="training", run_hash=run_hash).ht() features = hl.eval(ht.features) ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL) logger.info("Finished applying RF model") ht = ht.annotate_globals(rf_hash=run_hash) ht = ht.checkpoint( get_rf("rf_result", run_hash=run_hash).path, overwrite=True, ) ht_summary = ht.group_by("tp", "fp", TRAIN_COL, LABEL_COL, PREDICTION_COL).aggregate(n=hl.agg.count()) ht_summary.show(n=20)
def check_vcf_existence(participant_data: str, vcf_col: str, sample_map: str,
                        output_bucket: str) -> Dict[str, str]:
    """For each participant specified in sample_map, check that the vcf file exists, and if so, add the sample and vcf path to a dictionary

    :param str participant_data: participant data (downloaded data tab from terra)
    :param str vcf_col: name of column that contains vcf output
    :param str sample_map: path to file of samples to subset (tab-delimited participant_id and sample)
    :param str output_bucket: path to bucket to which results should be written
    :return: dictionary of samples for which the vcf existence was confirmed (sample as key, path to vcf as value)
    :rtype: Dict[str, str]
    """
    # create file that will contain the samples with confirmed vcfs and their paths
    out_vcf = hl.hadoop_open(f"{output_bucket}/vcfs_to_combine.list", "w")

    # create participants_of_interest dictionary which will contain samples to which the results should be subset
    participants_of_interest = {}
    confirmed_vcfs = {}
    with hl.hadoop_open(sample_map, "r") as f:
        next(f)
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            participant, sample = items[0:2]
            participants_of_interest[participant] = 0

    # load in data from terra
    participant_info = hl.import_table(participant_data)
    df = participant_info.to_pandas()

    # check if the sample is in participants_of_interest, check that the vcf exists,
    # and if yes to both, add to confirmed_vcfs dictionary
    for _, row in df.iterrows():
        participant_id = row["entity:participant_id"]
        sample = row["s"]
        vcf = row[vcf_col]

        if participant_id in participants_of_interest and vcf != "":
            if hl.hadoop_is_file(vcf):
                out_vcf.write(f"{sample}\t{vcf}\n")
                confirmed_vcfs[sample] = vcf

    out_vcf.close()

    return confirmed_vcfs
def load_rel(ns, path):
    rel = np.zeros((ns, ns))
    with hl.hadoop_open(path) as f:
        for i, l in enumerate(f):
            for j, n in enumerate(map(float, l.strip().split('\t'))):
                rel[i, j] = n
            self.assertEqual(j, i)
        self.assertEqual(i, ns - 1)
    return rel
def run(self):
    mt = self.import_mt()
    row_table = SeqrVariantsAndGenotypesSchema.elasticsearch_row(mt)
    self.export_table_to_elasticsearch(row_table, self._mt_num_shards(mt))
    with hl.hadoop_open(self.completed_marker_path, "w") as f:
        f.write(".")
    self.cleanup()
def plot_correlation_matrices(chr_list):
    """
    Plot combined correlation matrices for genotype-correlation and
    sumstats-correlation matrices
    """
    for ch in chr_list:
        ss_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_ss_correlation_chr{}.bm/'.format(ch))
        gt_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_gt_correlation_chr{}.bm/'.format(ch))
        M_max = int(1e4)  # max number of variants to be taken from the block matrices (suggested: 2e4)
        M = ss_ch.shape[0]  # dimension of block matrix

        # for idx in range(int(M/M_max)+1):  # index of which disjoint window we are looking at in the block matrix
        for idx in range(0, int(M / M_max) + 1):  # index of which disjoint window we are looking at in the block matrix
            M0 = M_max * (idx)  # start variant index for block matrix filtering
            M1 = min(M_max * (idx + 1), M)  # stop variant index for block matrix filtering
            ss_np = ss_ch[M0:M1, M0:M1].to_numpy()
            gt_np = gt_ch[M0:M1, M0:M1].to_numpy()
            print('\nStarting variant window: [' + str(M0) + ',' + str(M1) + ']')

            w = int(5e3)  # window width of variants for correlation matrix (suggested: 2e3)
            for i in range(int((M1 - M0 - 1) / w) + 1):
                w0 = w * i  # start variant index for window of correlation matrix
                w1 = min(w * (i + 1), M1 - M0)  # stop variant index for window of correlation matrix
                full = (ss_np[w0:w1, w0:w1] + gt_np[w0:w1, w0:w1].T)
                np.fill_diagonal(full, 1)

                fig, ax = plt.subplots()
                ax.imshow(full, cmap='bwr')
                ax.plot([0, w], [0, w], 'k--', alpha=0.5, lw=2)
                plt.xlim([0, w])
                plt.ylim([w, 0])
                ax.text(w * 0.83, w * 0.1, "SS", fontsize=60, alpha=0.5)
                ax.text(w * 0.02, w * 0.97, "GT", fontsize=60, alpha=0.5)
                plt.title('chr' + str(ch) + ' ' + variant_set + ' variants (' +
                          str(M0 + w0) + '-' + str(M0 + w1) + ')')
                fig = plt.gcf()
                fig.set_size_inches(10, 10)

                path = ('gs://nbaya/sumstats_corr/plots/chr' + str(ch) + '_' +
                        variant_set + '_' + str(M0 + w0).zfill(len(str(M))) +
                        '-' + str(M0 + w1).zfill(len(str(M))) + '.png')
                with hl.hadoop_open(path, 'wb') as f:
                    fig.savefig(f, dpi=600)
                plt.close()
            print('\nFinished variant window: [' + str(M0) + ',' + str(M1) + ']')
def get_sample_names_from_list_of_files(input_files, output_fname):
    sample_names_dict = hl.grep('#CHROM',
                                input_files,
                                max_count=100000,
                                show=False)
    sample_names = []
    for fname, lines in sample_names_dict.items():
        sample_names.append('\t'.join(
            [lines[0].strip().split('\t')[-1], fname]))
    with hl.hadoop_open(output_fname, 'w') as f:
        f.write('\n'.join(sample_names))
def parse_sample_mapping(sample_map_path: str) -> Tuple[List[str], List[str]]:
    sample_names: List[str] = list()
    sample_paths: List[str] = list()

    with hl.hadoop_open(sample_map_path) as f:
        for line in f:
            [name, path] = line.strip().split('\t')
            sample_names.append(name)
            sample_paths.append(path)

    return sample_names, sample_paths
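# A minimal usage sketch of parse_sample_mapping (assumption: the sample map
# path below is hypothetical; each line of the file must be a tab-delimited
# "<sample name>\t<path>" pair).
sample_names, sample_paths = parse_sample_mapping('gs://my-bucket/sample_map.tsv')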
def get_inverse_normalize_status(null_glmm_log):
    status = 'Unknown'
    with hl.hadoop_open(null_glmm_log) as f:
        for line in f:
            if line.startswith('$invNormalize'):
                try:
                    status = f.readline().strip().split()[1]
                except:
                    logger.warning(
                        f'Could not load inv_norm status from {line} in {null_glmm_log}.'
                    )
    return status.capitalize()
def read(cls, path):
    """Load reference genome from a JSON file.

    Notes
    -----

    The JSON file must have the following format:

    .. code-block:: text

        {"name": "my_reference_genome",
         "contigs": [{"name": "1", "length": 10000000},
                     {"name": "2", "length": 20000000},
                     {"name": "X", "length": 19856300},
                     {"name": "Y", "length": 78140000},
                     {"name": "MT", "length": 532}],
         "xContigs": ["X"],
         "yContigs": ["Y"],
         "mtContigs": ["MT"],
         "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}},
                 {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}]
        }

    `name` must be unique and not overlap with Hail's pre-instantiated
    references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and ``'default'``.

    The contig names in `xContigs`, `yContigs`, and `mtContigs` must be
    present in `contigs`.

    The intervals listed in `par` must have contigs in either `xContigs`
    or `yContigs` and must have positions between 0 and the contig length
    given in `contigs`.

    Parameters
    ----------
    path : :obj:`str`
        Path to JSON file.

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    with hl.hadoop_open(path) as f:
        return ReferenceGenome._from_config(json.load(f))
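# A minimal usage sketch (assumption: the JSON path below is hypothetical and
# follows the format documented in the docstring above). The classmethod is
# exposed on hl.ReferenceGenome.
my_rg = hl.ReferenceGenome.read('gs://my-bucket/my_reference_genome.json')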
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------
    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`
    """

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError('{} is not a dataset available in the repository.'.format(repr(name)))

    versions = set([dataset['version'] for dataset in datasets if dataset['name'] == name])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(repr(version),
                                                                   repr(name),
                                                                   repr('","'.join(versions))))

    reference_genomes = set([dataset['reference_genome'] for dataset in datasets if dataset['name'] == name])
    if reference_genome not in reference_genomes:
        raise ValueError("""Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".format(repr(reference_genome),
                                                                                    repr(name),
                                                                                    '\',\''.join((reference_genomes))))

    path = [dataset['path'] for dataset in datasets if all([dataset['name'] == name,
                                                            dataset['version'] == version,
                                                            dataset['reference_genome'] == reference_genome])][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset