def test_hadoop_is_file(self):
    a_file = f'{BUCKET}/test_hadoop_is_file.txt'
    with hadoop_open(a_file, 'w') as f:
        f.write("HELLO WORLD")

    self.assertTrue(hl.hadoop_is_file(a_file))
    self.assertFalse(hl.hadoop_is_file(f'{BUCKET}/'))
    self.assertFalse(hl.hadoop_is_file(f'{BUCKET}/invalid-path'))
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=0.5,
        default_memory=1.75,
        gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("tsv_path", help="Table with header: sample_id, cram_path, crai_path")
    p.add_argument("sample_id", nargs="*", help="(optional) 1 or more sample_ids to process. If not specified, all rows in the .tsv will be processed.")
    args = p.parse_args()

    df = pd.read_table(args.tsv_path)
    if {"sample_id", "cram_path", "crai_path"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain 'sample_id', 'cram_path', and 'crai_path' columns")

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args, batch_name=f"extract chrM") as batch:
        for _, row in df.iterrows():
            if args.sample_id and row.sample_id not in set(args.sample_id):
                continue

            input_filename = os.path.basename(row.cram_path)
            prefix = input_filename.replace(".bam", "").replace(".cram", "")
            output_cram_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram")
            output_crai_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram.crai")

            if not args.force and hl.hadoop_is_file(output_cram_path) and hl.hadoop_is_file(output_crai_path):
                logger.info(f"Output files exist (eg. {output_cram_path}). Skipping {input_filename}...")
                continue

            j = batch_utils.init_job(batch, f"chrM: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # copy inputs
            REF_PATHS = batch_utils.HG38_REF_PATHS
            fasta_filename = os.path.basename(parse.urlparse(REF_PATHS.fasta).path)

            j.command(f"""set -ex
env
gsutil -m cp {REF_PATHS.fasta} {REF_PATHS.fai} {REF_PATHS.dict} .

java -Xms2g -jar /gatk.jar PrintReads \
    -R {fasta_filename} \
    -I {row.cram_path} \
    --read-index {row.crai_path} \
    -L chrM \
    --gcs-project-for-requester-pays broad-mpg-gnomad \
    -O {prefix}.chrM.bam

samtools view -C -T {fasta_filename} {prefix}.chrM.bam > {prefix}.chrM.cram
samtools index {prefix}.chrM.cram {prefix}.chrM.cram.crai

gsutil -m cp {prefix}.chrM.cram.crai {output_crai_path}
gsutil -m cp {prefix}.chrM.cram {output_cram_path}
""")

            logger.info(f"Submitted {row.sample_id}: {output_cram_path}")
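Several of the Batch pipelines in this collection repeat the same guard: skip a sample when all of its expected outputs already exist, unless a --force flag was passed. Below is a minimal sketch of that idiom factored into a helper; the helper name outputs_already_exist and its force keyword are illustrative assumptions, not part of any of the original scripts.

import hail as hl

def outputs_already_exist(*paths, force=False):
    # Return True only when every expected output file is already present.
    # Passing force=True bypasses the check so the sample is reprocessed.
    if force:
        return False
    return all(hl.hadoop_is_file(path) for path in paths)

# usage sketch (mirrors the guard inside the loop above):
# if outputs_already_exist(output_cram_path, output_crai_path, force=args.force):
#     logger.info(f"Output files exist (eg. {output_cram_path}). Skipping {input_filename}...")
#     continue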
def test_hadoop_is_file(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir
    a_file = f'{prefix}/test_hadoop_is_file.txt'
    with hadoop_open(a_file, 'w') as f:
        f.write("HELLO WORLD")

    self.assertTrue(hl.hadoop_is_file(a_file))
    self.assertFalse(hl.hadoop_is_file(f'{prefix}/'))
    self.assertFalse(hl.hadoop_is_file(f'{prefix}/invalid-path'))
def test_hadoop_is_file(self, bucket=None):
    if bucket is None:
        bucket = self.remote_bucket
    a_file = f'{bucket}/test_hadoop_is_file.txt'
    with hadoop_open(a_file, 'w') as f:
        f.write("HELLO WORLD")

    self.assertTrue(hl.hadoop_is_file(a_file))
    self.assertFalse(hl.hadoop_is_file(f'{bucket}/'))
    self.assertFalse(hl.hadoop_is_file(f'{bucket}/invalid-path'))
def join_clump_hts(pop, not_pop, max_pops, high_quality=False, overwrite=False):
    r'''
    Wrapper for mwzj_hts_by_tree()
    '''
    assert not (not_pop and max_pops), '`not_pop` and `max_pops` cannot both be True'
    mt_path = get_clumping_results_path(pop=pop,
                                        not_pop=not_pop,
                                        max_pops=max_pops,
                                        high_quality=high_quality)
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS') and not overwrite:
        print(f'\nMT already written to {mt_path}! To overwrite, use overwrite=True')
        return
    else:
        print(f'Writing MT to {mt_path}')
    pop = pop.upper() if pop is not None else None
    clump_results_dir = (f'{ldprune_dir}/results{"_high_quality" if high_quality else ""}/' +
                         ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}'))
    ls = hl.hadoop_ls(f'{clump_results_dir}/*')
    all_hts = [x['path'] for x in ls if 'clump_results.ht' in x['path']]

    temp_dir = ('gs://ukbb-diverse-temp-30day/nb-temp/' +
                ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}') +
                f'{"-hq" if high_quality else ""}')
    globals_for_col_key = ukb_common.PHENO_KEY_FIELDS
    mt = mwzj_hts_by_tree(all_hts=all_hts,
                          temp_dir=temp_dir,
                          globals_for_col_key=globals_for_col_key)
    # mt = resume_mwzj(temp_dir=temp_dir,  # NOTE: only use if all the temp hts have been created
    #                  globals_for_col_key=globals_for_col_key)

    mt.write(mt_path, overwrite=overwrite)
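Many of the snippets here test for the _SUCCESS marker inside a Hail Table/MatrixTable directory rather than for the directory itself, because an interrupted write can leave a partial directory behind while the marker file is only produced when the write completes. A minimal sketch of that idiom as a reusable check; the helper name dataset_written is an assumption for illustration.

import hail as hl

def dataset_written(path: str) -> bool:
    # A Hail Table/MatrixTable write drops a _SUCCESS file into the output
    # directory when it finishes, so checking for that file distinguishes a
    # completed write from a partial one.
    return hl.hadoop_is_file(f'{path}/_SUCCESS')

# usage sketch:
# if dataset_written(mt_path) and not overwrite:
#     mt = hl.read_matrix_table(mt_path)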
def handle(self, *args, **options):
    samples = (IgvSample.objects.filter(
        individual__family__project__name__in=args
    ) if args else IgvSample.objects.all()).filter(
        file_path__startswith='gs://'
    ).prefetch_related('individual', 'individual__family__project')

    missing_counter = collections.defaultdict(int)
    guids_of_samples_with_missing_file = set()
    for sample in tqdm.tqdm(samples, unit=" samples"):
        if not hl.hadoop_is_file(sample.file_path):
            individual_id = sample.individual.individual_id
            project = sample.individual.family.project.name
            missing_counter[project] += 1
            logger.info('Individual: {} file not found: {}'.format(individual_id, sample.file_path))
            if not options.get('dry_run'):
                guids_of_samples_with_missing_file.add(sample.guid)

    if len(guids_of_samples_with_missing_file) > 0:
        IgvSample.bulk_update(user=None, update_json={'file_path': ''}, guid__in=guids_of_samples_with_missing_file)

    logger.info('---- DONE ----')
    logger.info('Checked {} samples'.format(len(samples)))
    if missing_counter:
        logger.info('{} files not found:'.format(sum(missing_counter.values())))
        for project_name, c in sorted(missing_counter.items(), key=lambda t: -t[1]):
            logger.info('  {} in {}'.format(c, project_name))
def create_full_results_file(prune, overwrite=False):
    r'''
    Concatenates PRS-phenotype regression results into a single table.
    '''
    reg_path_regex = prs_dir + f'prs_phen_reg.*.*.n_remove_{int(n_remove_per_sex)}.seed_*.{"" if prune else "not_"}pruned*.tsv'
    ls = hl.hadoop_ls(reg_path_regex)
    reg_paths = sorted([f['path'] for f in ls])
    df_list = []
    for reg_path in reg_paths:
        with hl.hadoop_open(reg_path) as f:
            df_list.append(pd.read_csv(f, sep='\t'))
    df = pd.concat(df_list, sort=False)
    df.insert(1, 'phen_desc',
              df.phen.astype(str).apply(lambda x: phen_dict[x][0]))  # add phenotype description to dataframe

    all_reg_results_path = prs_dir + f'prs_phen_reg.all_phens.n_remove_{int(n_remove_per_sex)}.{"" if prune else "not_"}pruned.tsv'
    if hl.hadoop_is_file(all_reg_results_path) and not overwrite:
        print('\n... Full PRS-phen regression results already written! ...')
        print(all_reg_results_path)
    else:
        print('\n... Writing PRS-phen regression results ...')
        print(all_reg_results_path)
        with hl.hadoop_open(all_reg_results_path, 'w') as f:
            df.to_csv(f, sep='\t', index=False)
def get_test_genotypes_mt(chrom, genotype_samples_ht_path, genotype_mt_path, cases_only):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())

    if chrom == 'all':
        mt = get_filtered_mt_with_x()
    else:
        mt = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))

    if cases_only:
        t2d_ht = hl.read_table('gs://ukbb-diverse-temp-30day/nb-scratch/t2d.ht/')
        t2d_ht = t2d_ht.filter(t2d_ht.both_sexes == 1)
        t2d_ht = t2d_ht.key_by('userId')
        mt = mt.filter_cols(hl.is_defined(t2d_ht[hl.int32(mt.s)]))

    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))

    if not hl.hadoop_is_file(f'{genotype_samples_ht_path}/_SUCCESS'):
        samples = mt.s.take(10)
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, overwrite=True)
    else:
        samples_ht = hl.read_table(genotype_samples_ht_path)
        samples = samples_ht.s.collect()
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))

    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()
    mt = mt.repartition(10)
    mt.write(genotype_mt_path)
def add_seqr_sample_to_locals3(sample: SeqrSample):
    parts = parse_vcf_s3_path(sample.path_to_vcf)
    local_filename = "vcfs/" + str(sample.project) + "/" + parts['filename']
    if not hl.hadoop_is_file("hdfs:///user/hdfs/" + local_filename):
        os.system('aws s3 cp ' + sample.path_to_vcf + ' .')
        os.system('hdfs dfs -put ' + parts['filename'] + ' ' + local_filename)
        os.system('rm ' + parts['filename'])
    return local_filename
def main(args): hl.init(log='/tmp/hail.log') n_max = 5000 # maximum number of samples in subset (equal to final sample size if there are sufficient samples for each population) subsets_dir = f'{bucket}/ld_prune/subsets_{round(n_max/1e3)}k' pops_list = get_pops_list(args) print(f'overwrite_plink: {args.overwrite_plink}') for pops in pops_list: pops_str = '-'.join(pops) ht_sample_path = f'{subsets_dir}/{pops_str}/{pops_str}.ht' bfile_prefix = f'{subsets_dir}/{pops_str}/{pops_str}' master_bfile_paths = [ f'{bfile_prefix}.{suffix}' for suffix in ['bed', 'bim', 'fam'] ] if not args.overwrite_plink and all( map(hl.hadoop_is_file, [f'{ht_sample_path}/_SUCCESS'] + master_bfile_paths)): continue else: print(f'\n... Starting PLINK exports for {pops_str} ...') mt_pop = get_mt_filtered_by_pops( pops=pops, chrom='all', # chrom='all' includes autosomes and chrX entry_fields=('GT', ) ) # default entry_fields will be 'GP', we need 'GT' for exporting to PLINK if hl.hadoop_is_file(f'{ht_sample_path}/_SUCCESS'): ht_sample = hl.read_table(ht_sample_path) ht_sample_ct = ht_sample.count() print(f'... Subset ht already exists for pops={pops_str} ...') print(f'\nSubset ht sample ct: {ht_sample_ct}\n\n') else: print(f'\n\n... Getting sample subset ({pops_str}) ...\n') ht_sample = get_subset(mt_pop=mt_pop, pop_dict=pop_dict, pops=pops, n_max=n_max) ht_sample_ct = ht_sample.count() print(f'\n\nht_sample_ct: {ht_sample_ct}\n\n') ht_sample = ht_sample.checkpoint(ht_sample_path) print(f'... Exporting to PLINK ({pops_str}) ...') to_plink(pops=pops, subsets_dir=subsets_dir, mt=mt_pop, ht_sample=ht_sample, bfile_path=bfile_prefix, overwrite=args.overwrite_plink)
def remap_samples(
    original_mt_path: str,
    input_mt: hl.MatrixTable,
    pedigree: hl.Table,
    inferred_sex: str,
) -> Tuple[hl.MatrixTable, hl.Table]:
    """
    Rename `s` col in the MatrixTable and inferred sex ht.

    :param original_mt_path: Path to original MatrixTable location
    :param input_mt: MatrixTable
    :param pedigree: Pedigree file from seqr loaded as a Hail Table
    :param inferred_sex: Path to text file of inferred sexes
    :return: mt and sex ht with sample names remapped
    """
    base_path = "/".join(dirname(original_mt_path).split("/")[:-1]) + "/base/projects"
    project_list = list(set(pedigree.Project_GUID.collect()))

    # Get the list of hts containing sample remapping information for each project
    remap_hts = []
    sex_ht = hl.import_table(inferred_sex)
    for i in project_list:
        remap = f"{base_path}/{i}/{i}_remap.tsv"
        if hl.hadoop_is_file(remap):
            remap_ht = hl.import_table(remap)
            remap_ht = remap_ht.key_by("s", "seqr_id")
            remap_hts.append(remap_ht)
    logger.info("Found %d projects that need to be remapped.", len(remap_hts))

    if len(remap_hts) > 0:
        ht = remap_hts[0]
        for next_ht in remap_hts[1:]:
            ht = ht.join(next_ht, how="outer")

        # If a sample has a non-missing value for seqr_id, rename it to the sample name for the mt and sex ht
        ht = ht.key_by("s")
        input_mt = input_mt.annotate_cols(seqr_id=ht[input_mt.s].seqr_id)
        input_mt = input_mt.key_cols_by(s=hl.if_else(
            hl.is_missing(input_mt.seqr_id), input_mt.s, input_mt.seqr_id))

        sex_ht = sex_ht.annotate(seqr_id=ht[sex_ht.s].seqr_id).key_by("s")
        sex_ht = sex_ht.key_by(s=hl.if_else(hl.is_missing(sex_ht.seqr_id),
                                            sex_ht.s, sex_ht.seqr_id))
    else:
        sex_ht = sex_ht.key_by("s")

    return input_mt, sex_ht
def run_gwas(mt, phen: str, sim_name: str, subset_idx: int, param_suffix: str, wd: str, is_logreg=True):
    assert 'GT' in mt.entry or 'dosage' in mt.entry, \
        "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"
    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))

    print(f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()
        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error', 'p_value').export(gwas_path)
    else:
        print(f'GWAS already run! ({gwas_path})')

    gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
    gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                               alleles=gwas_ht.alleles.replace(r'\["', '').replace(r'"\]', '').split('","'))
    gwas_ht = gwas_ht.key_by('locus', 'alleles')
    return gwas_ht
def get_adj_betas(p, pop, not_pop, max_pops, pheno_key_dict, pheno_id, high_quality, hail_script):
    r'''
    Wrapper method for both PLINK clumping and SBayesR
    '''
    output_dir = (f'{ldprune_dir}/results{"_high_quality" if high_quality else ""}/' +
                  ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}') +
                  f'/{pheno_id}')

    clump_output_txt = f'{output_dir}/clump_results.txt'  # PLINK clump output txt file
    clump_output_ht = f'{output_dir}/clump_results.ht'  # PLINK clump output hail table

    # sbayesr_output_txt = f'{output_dir}/sbayesr_results-test.txt'  # SBayesR output txt file
    # sbayesr_output_ht = f'{output_dir}/sbayesr_results-test.ht'  # SBayesR output hail table

    overwrite = False
    clump_file_exists = hl.hadoop_is_file(f'{clump_output_ht}/_SUCCESS')
    if not clump_file_exists or overwrite:
        if clump_file_exists and overwrite:
            print(f'\n\nWARNING: Existing results will be overwritten for {pheno_id} in {output_dir}!\n')

        ss_dict = get_sumstats(p=p,
                               pop=pop,
                               not_pop=not_pop,
                               max_pops=max_pops,
                               pops=pheno_key_dict['pops'],
                               high_quality=high_quality,
                               pheno_id=pheno_id,
                               method='clump')

        run_method(p=p,
                   pop=pop,
                   not_pop=not_pop,
                   max_pops=max_pops,
                   pheno_id=pheno_id,
                   pheno_key_dict=pheno_key_dict,
                   hail_script=hail_script,
                   output_txt=clump_output_txt,
                   output_ht=clump_output_ht,
                   ss_dict=ss_dict,
                   method='clump')
    else:
        print(f'\n\nSkipping {pheno_id} because results ht exists and overwrite=False\n')
def check_vcf_existence(participant_data: str, vcf_col: str, sample_map: str,
                        output_bucket: str) -> Dict[str, str]:
    """For each participant specified in sample_map, checks that the vcf file exists, and if so, adds the sample and vcf path to a dictionary

    :param str participant_data: participant data (downloaded data tab from terra)
    :param str vcf_col: name of column that contains vcf output
    :param str sample_map: path to file of samples to subset (tab-delimited participant_id and sample)
    :param str output_bucket: path to bucket to which results should be written
    :return: dictionary of samples for which the vcf existence was confirmed (sample as key, path to vcf as value)
    :rtype: Dict[str, str]
    """
    # create file that will contain the samples with confirmed vcfs and their paths
    out_vcf = hl.hadoop_open(f"{output_bucket}/vcfs_to_combine.list", "w")

    # create participants_of_interest dictionary which will contain samples to which the results should be subset
    participants_of_interest = {}
    confirmed_vcfs = {}
    with hl.hadoop_open(sample_map, "r") as f:
        next(f)
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            participant, sample = items[0:2]
            participants_of_interest[participant] = 0

    # load in data from terra
    participant_info = hl.import_table(participant_data)
    df = participant_info.to_pandas()

    # check if the sample is in participants_of_interest, check that the vcf exists, and if yes to both, add to confirmed_vcfs dictionary
    for _, row in df.iterrows():
        participant_id = row["entity:participant_id"]
        sample = row["s"]
        vcf = row[vcf_col]
        if participant_id in participants_of_interest and vcf != "":
            if hl.hadoop_is_file(vcf):
                out_vcf.write(f"{sample}\t{vcf}\n")
                confirmed_vcfs[sample] = vcf

    out_vcf.close()
    return confirmed_vcfs
def combine_all_dbs_for_chrom(args, batch, output_filename_prefix, chrom_to_combined_db_paths,
                              chrom_to_combine_db_jobs, temp_dir="./temp"):
    for chrom, combined_db_paths in chrom_to_combined_db_paths.items():
        output_filename = f"all_variants_{output_filename_prefix}.chr{chrom}.db"
        combine_db_jobs = chrom_to_combine_db_jobs[chrom]
        if not args.force and hl.hadoop_is_file(f"{args.output_dir}/{output_filename}"):
            logger.info(f"{output_filename} already exists. Skipping...")
            continue

        cpu = 2
        j3 = batch_utils.init_job(batch, f"combine all dbs (cpu: {cpu}): {output_filename}",
                                  DOCKER_IMAGE if not args.raw else None, cpu=cpu, disk_size=cpu * 21)
        batch_utils.switch_gcloud_auth_to_user_account(j3, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

        # don't use batch_utils.localize here because the command becomes too large
        j3.command("gsutil -m cp " + " ".join(combined_db_paths) + " .")
        local_input_db_paths = [os.path.basename(p) for p in combined_db_paths]

        add_command_to_combine_dbs(j3, output_filename, local_input_db_paths,
                                   select_chrom=None, set_combined_bamout_id=None,
                                   create_index=True, temp_dir=temp_dir)
        j3.command(f"gsutil -m cp {output_filename} {args.output_dir}/")

        for j2 in combine_db_jobs:
            j3.depends_on(j2)
def get_mt(overwrite=False):
    mt_path = 'gs://ukbb-temp-30day/nbaya/ukb31063.hm3_variants.gwas_samples.mt'
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS'):
        mt = hl.read_matrix_table(mt_path)
    else:
        tmp_mt_path = mt_path.replace('nbaya/', 'nbaya/tmp-')

        def _write_tmp_mt():
            ## part 1: about 1 hr with 30 workers (possibly starting with 10 then increasing to 30 if progress stalls)
            variants = hl.import_table(
                'gs://nbaya/hapmap3_variants.tsv.gz',
                force=True)  # download here: https://github.com/nikbaya/split/blob/master/hapmap3_variants.tsv.gz
            variants = variants.key_by(**hl.parse_variant(variants.v))
            mt = get_ukb_imputed_data('all', variant_list=variants,
                                      entry_fields=('dosage', ))  # 'all' = autosomes only
            # print(mt.count())  # (1089172, 487409)
            # mt = mt.checkpoint(mt_path.replace('nbaya/','nbaya/tmp-'), overwrite=overwrite)
            mt.write(tmp_mt_path, _read_if_exists=True)

        def _repartition():
            ## part 2: 5 min with 100 preemptibles
            mt = hl.read_matrix_table(tmp_mt_path)
            mt = mt.repartition(1000)
            withdrawn = hl.read_table('gs://ukb31063/ukb31063.withdrawn_samples.ht')
            mt = mt.anti_join_cols(withdrawn)
            # print(mt.count())  # (1089172, 487409)
            covs = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_covariates.both_sexes.ht')
            mt = mt.annotate_cols(**covs[mt.s])
            mt = mt.filter_cols(hl.is_defined(mt.PC1))
            # print(mt.count())  # (1089172, 361144)
            return mt.checkpoint(mt_path, overwrite=overwrite)

        _write_tmp_mt()
        mt = _repartition()
    return mt
def handle(self, *args, **options): samples = (IgvSample.objects.filter( individual__family__project__name__in=args ) if args else IgvSample.objects.all()).filter( file_path__startswith='gs://' ).prefetch_related('individual', 'individual__family__project') missing_counter = collections.defaultdict(int) guids_of_samples_with_missing_file = set() project_name_to_missing_paths = collections.defaultdict(list) for sample in tqdm.tqdm(samples, unit=" samples"): if not hl.hadoop_is_file(sample.file_path): individual_id = sample.individual.individual_id project_name = sample.individual.family.project.name missing_counter[project_name] += 1 project_name_to_missing_paths[project_name].append((individual_id, sample.file_path)) logger.info('Individual: {} file not found: {}'.format(individual_id, sample.file_path)) if not options.get('dry_run'): guids_of_samples_with_missing_file.add(sample.guid) if len(guids_of_samples_with_missing_file) > 0: IgvSample.bulk_update(user=None, update_json={'file_path': ''}, guid__in=guids_of_samples_with_missing_file) logger.info('---- DONE ----') logger.info('Checked {} samples'.format(len(samples))) if missing_counter: logger.info('{} files not found:'.format(sum(missing_counter.values()))) for project_name, c in sorted(missing_counter.items(), key=lambda t: -t[1]): logger.info(' {} in {}'.format(c, project_name)) # post to slack if not options.get('dry_run'): slack_message = 'Found {} broken bam/cram path(s)\n'.format(sum(missing_counter.values())) for project_name, missing_paths_list in project_name_to_missing_paths.items(): slack_message += "\nIn project {}:\n".format(project_name) slack_message += "\n".join([ " {} {}".format(individual_id, path) for individual_id, path in missing_paths_list ]) communication_utils.safe_post_to_slack(SEQR_SLACK_DATA_ALERTS_NOTIFICATION_CHANNEL, slack_message)
def get_ref_X(ref_panel, overwrite=False):
    r'''
    Returns N_ref x M dim matrix of column-standardized genotypes of LD ref panel
    '''
    X_bm_path = f'{bucket}/{ref_panel}.X.bm'

    if overwrite or not hl.hadoop_is_file(f'{X_bm_path}/_SUCCESS'):
        mt = hl.import_plink(bed=f'{bucket}/{ref_panel}.bed',
                             bim=f'{bucket}/{ref_panel}.bim',
                             fam=f'{bucket}/{ref_panel}.fam')
        mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))
        mt = mt.annotate_entries(X=(mt.GT.n_alt_alleles() - mt.stats.mean) / mt.stats.stdev)
        X = BlockMatrix.from_entry_expr(mt.X)
        X = X.T
        X.write(f'{bucket}/{ref_panel}.X.bm', overwrite=True)

    X = BlockMatrix.read(X_bm_path)
    return X
param_suffix = f'{gt_sim_suffix}.h2_{h2}.pi_{pi}.K_{K}.seed_{seed}'
betas_path = f'{smiles_wd}/betas.{param_suffix}.tsv.gz'
phens_path = f'{smiles_wd}/phens.{param_suffix}.tsv.gz'

if sim_name[:3] == 'bn_':
    mt = hl.balding_nichols_model(n_populations=n_pops,
                                  n_samples=n_sim,
                                  n_variants=n_vars,
                                  fst=fst)
    mt = mt.filter_rows(
        (hl.abs(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 - 0.5) < 0.5))  # remove invariant SNPs
    mt = mt.annotate_cols(s=hl.str(mt.sample_idx))

if hl.hadoop_is_file(betas_path) and hl.hadoop_is_file(phens_path):
    # betas = hl.import_table(betas_path, impute=True, force=True)
    # betas = betas.annotate(locus = hl.parse_locus(betas.locus),
    #                        alleles = betas.alleles.replace('\[\"','').replace('\"\]','').split('\",\"'))
    # betas = betas.key_by('locus','alleles')
    phens = hl.import_table(phens_path,
                            key=['s'],
                            types={'s': hl.tstr},
                            impute=True,
                            force=True)
    sim_mt = mt.annotate_cols(y_binarized=phens[mt.s].y_binarized)
else:
    sim_mt = get_sim_mt(mt=mt, h2=h2, pi=pi, K=K)
def run_method(p, pop, not_pop, max_pops, pheno_key_dict, pheno_id, hail_script, output_txt, output_ht, ss_dict, method): r''' Runs either PLINK clump (method = 'clump') or SBayesR (method = 'sbayesr') ''' assert method in {'clump', 'sbayesr'} task_suffix = (f'{"not_" if not_pop else ""}{pop}' if not max_pops else 'max_pops') + f'-{pheno_id}' # TODO: if method = 'sbayesr' check if LD matrix has already been calculated tasks = [] ref_subset = '-'.join(pheno_key_dict['pops'] if max_pops else ( [p for p in POPS if p is not pop] if not_pop else [pop])) print(f'Using LD reference panel of {ref_subset}') ## run plink clumping for chrom, ss_chrom in ss_dict.items(): ## read ref ld plink files bfile = read_plink_input_group_chrom(p=p, method=method, subset=ref_subset, chrom=chrom) get_betas = p.new_job(name=f'{method}_{task_suffix}_chr{chrom}') # TODO: change image to include GCTB if running SBayesR? get_betas.cpu(1) # plink clump cannot multithread get_betas.command('set -ex') if method == 'clump': get_betas.storage('5G') # default: 5G # clump_memory = -15*(chrom-1)+400 # Memory requested for PLINK clumping in MB. equation: -15*(chrom-1) + 500 is based on 400 MB for chr 1, 80 MB for chr 22 clump_memory = 3.75 # in GB get_betas.memory(clump_memory) # default: 30G get_betas.command(f'head {ss_chrom}') get_betas.command(' '.join([ 'plink', '--bfile', str(bfile), '--memory', str(clump_memory * 1000), # memory in MB '--threads', '1', # explicitly set threads to 1 '--clump', ss_chrom, '--clump-field P', '--clump-snp-field SNP', '--clump-p1 1', '--clump-p2 1', '--clump-r2 0.1', '--clump-kb 500', '--output-chr M', # necessary to code chr X as 'X' instead of '23', which isn't allowed as a contig in Hail's GRCh37 locus '--chr', str(chrom), '--out', f'{get_betas.ofile}_tmp' ])) get_betas.command(' '.join([ 'awk', "'{ print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 }'", "OFS='\t'", f'{get_betas.ofile}_tmp.clumped', '2>', '/dev/null', '|', 'tail -n+2', '|', # don't include header "sed '/^[[:space:]]*$/d'", # remove 2 empty lines created by PLINK at the end of the output file '>', str(get_betas.ofile) ])) elif method == 'sbayesr': ldm_type = 'full' # options: full, sparse ldm_path = f'{ldprune_dir}/subsets_50k/not_{pop}/ldm/not_{pop}.hm3.chr{chrom}.maf_gt_0.ldm.{ldm_type}' # ldm_path = f'{ldprune_dir}/subsets_50k/not_{pop}/ldm/not_{pop}.hm3.chr{chrom}.maf_gt_0.chisq_5.ldm.sparse' if hl.hadoop_is_file(f'{ldm_path}.info') and hl.hadoop_is_file( f'{ldm_path}.bin'): ldm = p.read_input_group(info=f'{ldm_path}.info', bin=f'{ldm_path}.bin') else: make_ldm = p.new_job( name=f'make_{ldm_type}_ldm_{task_suffix}.chr{chrom}') make_ldm.memory('60G') make_ldm.command(' '.join([ 'wget', 'https://cnsgenomics.com/software/gctb/download/gctb_2.0_Linux.zip', '-P', '~/' ])) make_ldm.command(' '.join( ['unzip', '~/gctb_2.0_Linux.zip', '-d', '~/'])) make_ldm.command(' '.join(['ls', '-ltrR', '~/'])) make_ldm.command(' '.join( ['mv', '~/gctb_2.0_Linux/gctb', '/usr/local/bin/'])) make_ldm.command(' '.join([ 'plink', '--bfile', str(bfile), '--maf 0.0000000001', '--make-bed', '--out', f'{make_ldm.ofile}_tmp1' ])) make_ldm.command(' '.join([ 'gctb', '--bfile', f'{make_ldm.ofile}_tmp1', # '--snp 1-1000', f'--make-{ldm_type}-ldm', '--out', f'{make_ldm.ofile}_tmp2' ])) # TODO: use both .bin and .info files make_ldm.command(' '.join([ 'mv', f'{make_ldm.ofile}_tmp2.ldm.{ldm_type}', str(make_ldm.ofile) ])) p.write_output(make_ldm.ofile, ldm_path) ldm = make_ldm.ofile get_betas.declare_resource_group( out={ 'log': '{root}.log', 'snpRes': 
'{root}.snpRes', 'parRes': '{root}.parRes', 'mcmcsamples.SnpEffects': '{root}.mcmcsamples.SnpEffects', 'mcmcsamples.Par': '{root}.mcmcsamples.Par' }) get_betas.command(' '.join([ 'wget', 'https://cnsgenomics.com/software/gctb/download/gctb_2.0_Linux.zip', '-P', '~/' ])) get_betas.memory('18G') get_betas.command(' '.join( ['unzip', '~/gctb_2.0_Linux.zip', '-d', '~/'])) get_betas.command(' '.join(['ls', '-ltrR', '~/'])) get_betas.command(' '.join( ['mv', '~/gctb_2.0_Linux/gctb', '/usr/local/bin/'])) get_betas.command(' '.join([ 'gctb', '--sbayes R', '--ldm', str(ldm), '--pi 0.95,0.02,0.02,0.01', '--gamma 0.0,0.01,0.1,1', '--gwas-summary', f' <( gunzip -c {ss_chrom} | grep -v "NA" )', '--chain-length 10000', '--burn-in 2000', '--out-freq 10', '--out', f'{get_betas.out}' ])) get_betas.command(' '.join(['head', f'{get_betas.out}.snpRes'])) get_betas.command(' '.join( ['mv', f'{get_betas.out}.snpRes', str(get_betas.ofile)])) tasks.append(get_betas) get_betas_sink = p.new_job(name=f'{method}_sink_{task_suffix}') get_betas_sink.command( f'cat {" ".join([t.ofile for t in tasks])} > {get_betas_sink.ofile}' ) # this task implicitly depends on the chromosome scatter tasks p.write_output(get_betas_sink.ofile, output_txt) ## import as hail table and save n_threads = 8 tsv_to_ht = p.new_job(name=f'{method}_to_ht_{task_suffix}') tsv_to_ht = tsv_to_ht.image( 'gcr.io/ukbb-diversepops-neale/nbaya_hail:latest') tsv_to_ht.storage('1G') tsv_to_ht.memory('100M') tsv_to_ht.cpu(n_threads) tsv_to_ht.depends_on(get_betas_sink) tsv_to_ht.command('set -ex') tsv_to_ht.command(' '.join([ 'PYTHONPATH=$PYTHONPATH:/', 'PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=4g --conf spark.executor.memory=24g pyspark-shell"' ])) tsv_to_ht.command(' '.join([ 'python3', str(hail_script), '--input_file', f'"{output_txt}"', # output_txt must be doubly enclosed by quotes needed for files with "|" in their pheno_id '--tsv_to_ht', '--trait_type', f'''"{pheno_key_dict['trait_type']}"''', '--phenocode', f'''"{pheno_key_dict['phenocode']}"''', '--pheno_sex', f'''"{pheno_key_dict['pheno_sex']}"''', '--output_file', f'"{output_ht}"', # output_ht must be doubly enclosed by quotes needed for files with "|" in their pheno_id '--overwrite' ] + (['--coding', f'''"{pheno_key_dict['coding']}"'''] if pheno_key_dict['coding'] != '' else []) + ( ['--modifier', f'''"{pheno_key_dict['modifier']}"'''] if pheno_key_dict['modifier'] != '' else [])))
def main(): rnaseq_sample_metadata_df = get_joined_metadata_df() p = argparse.ArgumentParser() grp = p.add_mutually_exclusive_group(required=True) grp.add_argument("--local", action="store_true", help="Batch: run locally") grp.add_argument("--cluster", action="store_true", help="Batch: submit to cluster") p.add_argument( "--batch-billing-project", default="tgg-rare-disease", help="Batch: billing project. Required if submitting to cluster.") p.add_argument("--batch-job-name", help="Batch: (optional) job name") p.add_argument( "-f", "--force", action="store_true", help="Recompute and overwrite cached or previously computed data") grp = p.add_mutually_exclusive_group(required=True) grp.add_argument("-b", "--rnaseq-batch-name", nargs="*", help="RNA-seq batch names to process", choices=set( rnaseq_sample_metadata_df['star_pipeline_batch'])) grp.add_argument("-s", "--rnaseq-sample-id", nargs="*", help="RNA-seq sample IDs to process", choices=set(rnaseq_sample_metadata_df['sample_id'])) args = p.parse_args() #logger.info("\n".join(df.columns)) if args.rnaseq_batch_name: batch_names = args.rnaseq_batch_name sample_ids = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df[ 'star_pipeline_batch'].isin(batch_names)].sample_id elif args.rnaseq_sample_id: sample_ids = args.rnaseq_sample_id logger.info( f"Processing {len(sample_ids)} sample ids: {', '.join(sample_ids)}") # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details if args.local: backend = hb.LocalBackend(gsa_key_file=os.path.expanduser( "~/.config/gcloud/misc-270914-cb9992ec9b25.json")) else: backend = hb.ServiceBackend(args.batch_billing_project) b = hb.Batch(backend=backend, name=args.batch_job_name) # define workflow inputs if args.local: genes_gtf = b.read_input("gencode.v26.annotation.gff3", extension=".gff3") else: genes_gtf = b.read_input( "gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.GRCh38.gff3", extension=".gff3") # define parallel execution for samples for sample_id in sample_ids: metadata_row = rnaseq_sample_metadata_df.loc[sample_id] batch_name = metadata_row['star_pipeline_batch'] # set job inputs & outputs input_read_data = b.read_input_group( bam=metadata_row['star_bam'], bai=metadata_row['star_bai'], ) output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/majiq_build/" output_file_path = os.path.join(output_dir, f"majiq_build_{sample_id}.tar.gz") # check if output file already exists if hl.hadoop_is_file(output_file_path) and not args.force: logger.info( f"{sample_id} output file already exists: {output_file_path}. Skipping..." 
) continue file_stats = hl.hadoop_stat(metadata_row['star_bam']) bam_size = int(round(file_stats['size_bytes'] / 10.**9)) # define majiq build commands for this sample j = b.new_job(name=args.batch_job_name) j.image("weisburd/majiq:latest") j.storage(f'{bam_size*3}Gi') j.cpu(1) # default: 1 j.memory("15G") # default: 3.75G logger.info( f'Requesting: {j._storage or "default"} storage, {j._cpu or "default"} CPU, {j._memory or "default"} memory' ) # switch to user account j.command( f"gcloud auth activate-service-account --key-file /gsa-key/key.json" ) j.command( f"gsutil -m cp -r {GCLOUD_CREDENTIALS_LOCATION}/.config /tmp/") j.command(f"rm -rf ~/.config") j.command(f"mv /tmp/.config ~/") j.command(f"gcloud config set account {GCLOUD_USER_ACCOUNT}") j.command(f"gcloud config set project {GCLOUD_PROJECT}") # run majiq build #j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/GENCODE/gencode.v26.GRCh38.ERCC.genes.collapsed_only.gtf .") j.command(f"mv {genes_gtf} gencode.gff3") j.command(f"mv {input_read_data.bam} {sample_id}.bam") j.command(f"mv {input_read_data.bai} {sample_id}.bam.bai") j.command(f"echo '[info]' >> majiq_build.cfg") j.command( f"echo 'readlen={metadata_row['read length (rnaseqc)']}' >> majiq_build.cfg" ) j.command(f"echo 'bamdirs=.' >> majiq_build.cfg") j.command(f"echo 'genome=hg38' >> majiq_build.cfg") j.command( f"echo 'strandness={'None' if metadata_row['stranded? (rnaseqc)'] == 'no' else 'reverse'}' >> majiq_build.cfg" ) j.command(f"echo '[experiments]' >> majiq_build.cfg") j.command(f"echo '{sample_id}={sample_id}' >> majiq_build.cfg") j.command(f"cat majiq_build.cfg >> {j.logfile}") j.command( f"majiq build gencode.gff3 -c majiq_build.cfg -j 1 -o majiq_build_{sample_id} >> {j.logfile}" ) j.command( f"tar czf majiq_build_{sample_id}.tar.gz majiq_build_{sample_id}") j.command(f"cp majiq_build_{sample_id}.tar.gz {j.output_tar_gz}") #j.command(f"ls -lh . >> {j.logfile}") #j.command(f"echo ls majiq_build_{sample_id} >> {j.logfile}") #j.command(f"ls -1 majiq_build_{sample_id} >> {j.logfile}") j.command(f"echo --- done {output_file_path} >> {j.logfile}") # copy output b.write_output(j.output_tar_gz, output_file_path) b.write_output( j.logfile, os.path.join(output_dir, f"majiq_build_{sample_id}.log")) b.run() if isinstance(backend, hb.ServiceBackend): backend.close()
def get_sumstats(p, pop: str, not_pop: bool, max_pops: bool, pops: list, high_quality: bool, pheno_id: str, method: str, chromosomes: list = all_chromosomes): r''' Returns a dict of per-chromosome summary statistics output files. ''' assert not (not_pop and max_pops), '`not_pop` and `max_pops` cannot both be True' assert method in {'clump', 'sbayesr'} if max_pops and len(pops) == 1 and pop is None: pop = pops[ 0] # need to set this variable in order to find column indices later num_pops = len(pops) filename = f'{pheno_id}.tsv.bgz' trait_type = pheno_id.split('-')[0] trait_category = 'quant' if trait_type in ['continuous', 'biomarkers' ] else 'binary' variant_manifest = p.read_input( f'{ldprune_dir}/variant_qc/full_variant_qc_metrics.txt.bgz') variant_manifest_tabix = p.read_input( f'{ldprune_dir}/variant_qc_tabix/full_variant_qc_metrics.txt.bgz.tbi') loo_6pop_dir = f'{ldprune_dir}/loo/sumstats/batch2' loo_6pop_ss_fname = f'{loo_6pop_dir}/{filename}' loo_6pop_tabix_fname = f'{loo_6pop_dir}_tabix/{filename}.tbi' ss_dir = f'{bucket}/sumstats_flat_files' ss_fname = f'{ss_dir}/{filename}' tabix_fname = f'{ss_dir}_tabix/{filename}.tbi' get_ss = p.new_job(name=f'get_ss_{pheno_id}') get_ss = get_ss.image('gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest') get_ss.storage('100M') # default: 1G get_ss.cpu(1) bgz_fname = f'{get_ss.ofile}.bgz' tbi_fname = f'{get_ss.ofile}.bgz.tbi' get_ss.command('set -ex') variant_manifest_bgz = f'{get_ss.ofile}.variants.bgz' variant_manifest_tbi = f'{get_ss.ofile}.variants.bgz.tbi' get_ss.command(' '.join(['mv', variant_manifest, variant_manifest_bgz])) get_ss.command(' '.join( ['mv', variant_manifest_tabix, variant_manifest_tbi])) if not_pop and hl.hadoop_is_file(loo_6pop_ss_fname) and hl.hadoop_is_file( loo_6pop_tabix_fname ): # phenotype is 6-pop and has leave-one-out sumstats generated. 
# assert False, "don't run 6-pop LOO" print( f'Using 6-pop LOO sumstats for {pheno_id} ({"not_" if not_pop else ""}{pop})' ) ss = p.read_input(loo_6pop_ss_fname) tabix = p.read_input(loo_6pop_tabix_fname) get_ss.command(' '.join([ 'mv', ss, bgz_fname ])) # necessary instead of changing path extension for input files get_ss.command(' '.join([ 'mv', tabix, tbi_fname ])) # necessary instead of changing path extension for input files get_ss.command('\n'.join(f''' tabix -h {bgz_fname} {chrom} | \\ cut -f5,{6+POPS.index(pop)} | \\ sed 's/pval_not_{pop}/P/g' | \\ awk '$2!="NA" {{print}}' > {get_ss[f'ofile_{chrom}']} ''' for chrom in chromosomes)) elif hl.hadoop_is_file(ss_fname) and hl.hadoop_is_file( tabix_fname ): # this conditional block must come after checking for 6-pop LOO results print(f'Using {num_pops}-pop sumstats for {pheno_id} ' + ('({"not_" if not_pop else ""}{pop})' if not max_pops else '(max_pops=True)')) ss = p.read_input(ss_fname) tabix = p.read_input(tabix_fname) get_ss.command(' '.join([ 'mv', ss, bgz_fname ])) # necessary instead of changing path extension for input files get_ss.command(' '.join([ 'mv', tabix, tbi_fname ])) # necessary instead of changing path extension for input files if not_pop or (max_pops and len(pops) > 1): # if clumping on meta-analyzed sumstats pval_col_idx = 8 if trait_category == 'quant' else 9 # due to additional AF columns in binary traits, pvalue column location may change awk_arg1 = '' awk_arg2 = '$2!="NA"' + ( '&& $3!="false"' if high_quality else '' ) # exclude pval(col 2)=NA; if high_quality: exclude high_quality(col 3)=false sed_arg = "-e 's/pval_meta/P/g'" else: # if clumping single population results pval_col_idx = (4 + ( (4 + (trait_category == 'binary') + 1) if num_pops > 1 else 0) + ((trait_category == 'binary') + 3) * num_pops + pops.index(pop) + 1) low_confidence_col_idx = ( 4 + # first 4 cols ((4 + (trait_category == 'binary') + 1) if num_pops > 1 else 0) + # meta-analysis fields ((trait_category == 'binary') + 4) * num_pops + # per-pop fields pops.index(pop) + 1) awk_arg1 = f', $3=${low_confidence_col_idx}' awk_arg2 = '$2!="NA" && $3!="true"' + ( ' && $4!="false"' if high_quality else '' ) # exclude pval(col 2)=NA, low_confidence(col 3)=True; if high_quality: exclude high_quality(col 4)=False sed_arg = f"-e 's/pval_{pop}/P/g'" # sed argument for replacing column name # TODO: If possible, consolidate the following blocks if high_quality: get_ss.command('\n'.join(f''' paste <( tabix -h {bgz_fname} {chrom} | \\ awk '{{print $1=$1":"$2":"$3":"$4, $2=${pval_col_idx}{awk_arg1}}}' | \\ sed -e 's/chr:pos:ref:alt/SNP/g' {sed_arg} ) \\ <( tabix -h {variant_manifest_bgz} {chrom} | \\ awk '{{ print $9 }}' ) | \\ awk '{{if({awk_arg2}) print $1,$2}}' > {get_ss[f"ofile_{chrom}"]} ''' for chrom in chromosomes)) else: get_ss.command('\n'.join(f''' tabix -h {bgz_fname} {chrom} | \\ awk '{{print $1=$1":"$2":"$3":"$4, $2=${pval_col_idx}{awk_arg1}}}' | \\ sed -e 's/chr:pos:ref:alt/SNP/g' {sed_arg} | \\ awk '{{if({awk_arg2}) print $1,$2}}' > {get_ss[f"ofile_{chrom}"]} ''' for chrom in chromosomes)) ss_dict = {chrom: get_ss[f'ofile_{chrom}'] for chrom in chromosomes} return ss_dict
def main(): rnaseq_sample_metadata_df = get_joined_metadata_df() #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df() p = batch_utils.init_arg_parser( default_cpu=4, gsa_key_file=os.path.expanduser( "~/.config/gcloud/misc-270914-cb9992ec9b25.json")) grp = p.add_mutually_exclusive_group(required=True) grp.add_argument( "-b", "--rnaseq-batch-name", nargs="*", help="RNA-seq batch names to process (eg. -b batch1 batch2)", choices=set(rnaseq_sample_metadata_df['star_pipeline_batch']) | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"])) grp.add_argument( "-s", "--rnaseq-sample-id", nargs="*", help="RNA-seq sample IDs to process (eg. -s sample1 sample2)", choices=set(rnaseq_sample_metadata_df['sample_id']) | set([ 'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E', 'GTEX-1KXAM-0005-SM-DIPEC' ])) args = p.parse_args() # Generate samples_df with these columns: sample_id, star_SJ_out_tab, output_dir, batch_name samples_df = pd.DataFrame() if args.rnaseq_batch_name: for batch_name in args.rnaseq_batch_name: df = rnaseq_sample_metadata_df[ rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name] samples_df = transfer_metadata_columns_from_df(samples_df, df) elif args.rnaseq_sample_id: df = rnaseq_sample_metadata_df[ rnaseq_sample_metadata_df.sample_id.isin(set( args.rnaseq_sample_id))] samples_df = transfer_metadata_columns_from_df(samples_df, df) else: p.error("Must specify -b or -s") logger.info( f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}" ) # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details with batch_utils.run_batch(args) as batch: for sample_id in samples_df.sample_id: metadata_row = samples_df.loc[sample_id] # set job inputs & outputs output_dir = metadata_row['output_dir'] print("Input file: ", metadata_row['star_SJ_out_tab']) output_filename = f"{sample_id}.junctions.bed.gz" output_bed_gz_file_path = os.path.join(output_dir, output_filename) # check if output file already exists if hl.hadoop_is_file(output_bed_gz_file_path) and not args.force: logger.info( f"{sample_id} output file already exists: {output_bed_gz_file_path}. Skipping..." ) continue j = batch_utils.init_job(batch, name=f"tab=>bed: {sample_id}", cpu=args.cpu, memory=args.memory, disk_size=5, image=DOCKER_IMAGE) batch_utils.switch_gcloud_auth_to_user_account( j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp {metadata_row['star_SJ_out_tab']} ." ) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.gff3.gz ." ) j.command(f"pwd && ls && date") j.command( f"python3 /convert_SJ_out_tab_to_junctions_bed.py -g gencode.v26.annotation.gff3.gz {os.path.basename(metadata_row['star_SJ_out_tab'])}" ) j.command(f"cp {output_filename} {j.output_bed_gz}") j.command(f"cp {output_filename}.tbi {j.output_bed_gz_tbi}") j.command(f"echo Done: {output_bed_gz_file_path}") j.command(f"date") # copy output batch.write_output(j.output_bed_gz, output_bed_gz_file_path) batch.write_output(j.output_bed_gz_tbi, f"{output_bed_gz_file_path}.tbi") print("Output file path: ", output_bed_gz_file_path)
def main(): p, args = parse_args() df = pd.read_table(args.cram_and_tsv_paths_table) if {"sample_id", "cram_path", "crai_path", "variants_tsv_bgz"} - set( df.columns): p.error( f"{args.tsv_path} must contain 'sample_id', 'cram_path' columns") # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster batch_utils.set_gcloud_project(GCLOUD_PROJECT) if args.cluster: batch_utils.check_storage_bucket_region(df.cram_path) if not args.force: hl.init(log="/dev/null", quiet=True) # process samples with batch_utils.run_batch(args, batch_name=f"HaplotypeCaller -bamout") as batch: counter = 0 for _, row in tqdm.tqdm(df.iterrows(), unit=" rows", total=len(df)): if args.sample_to_process and row.sample_id not in set( args.sample_to_process): continue input_filename = os.path.basename(row.cram_path) output_prefix = input_filename.replace(".bam", "").replace(".cram", "") output_bam_path = os.path.join(args.output_dir, f"{output_prefix}.bamout.bam") output_bai_path = os.path.join(args.output_dir, f"{output_prefix}.bamout.bai") if not args.force and hl.hadoop_is_file( output_bam_path) and hl.hadoop_is_file(output_bai_path): logger.info( f"Output files exist (eg. {output_bam_path}). Skipping {input_filename}..." ) continue counter += 1 if args.num_samples_to_process and counter > args.num_samples_to_process: break j = batch_utils.init_job(batch, f"readviz: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory) batch_utils.switch_gcloud_auth_to_user_account( j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT) local_exclude_intervals = batch_utils.localize_file( j, EXCLUDE_INTERVALS) local_fasta = batch_utils.localize_file( j, batch_utils.HG38_REF_PATHS.fasta, use_gcsfuse=True) local_fasta_fai = batch_utils.localize_file( j, batch_utils.HG38_REF_PATHS.fai, use_gcsfuse=True) batch_utils.localize_file(j, batch_utils.HG38_REF_PATHS.dict, use_gcsfuse=True) local_tsv_bgz = batch_utils.localize_file(j, row.variants_tsv_bgz) local_cram_path = batch_utils.localize_file(j, row.cram_path) local_crai_path = batch_utils.localize_file(j, row.crai_path) j.command(f"""echo -------------- echo "Start - time: $(date)" df -kh # 1) Convert variants_tsv_bgz to sorted interval list gunzip -c "{local_tsv_bgz}" | awk '{{ OFS="\t" }} {{ print( "chr"$1, $2, $2 ) }}' | bedtools slop -b {PADDING_AROUND_VARIANT} -g {local_fasta_fai} > variant_windows.bed # Sort the .bed file so that chromosomes are in the same order as in the input_cram file. # Without this, if the input_cram has a different chromosome ordering (eg. chr1, chr10, .. vs. chr1, chr2, ..) # than the interval list passed to GATK tools' -L arg, then GATK may silently skip some of regions in the -L intervals. # The sort is done by first retrieving the input_cram header and passing it to GATK BedToIntervalList. 
java -Xms2g -jar /gatk/gatk.jar PrintReadsHeader \ --gcs-project-for-requester-pays {GCLOUD_PROJECT} \ -R {local_fasta} \ -I "{local_cram_path}" \ -O header.bam java -Xms2g -jar /gatk/gatk.jar BedToIntervalList \ --SORT true \ --SEQUENCE_DICTIONARY header.bam \ --INPUT variant_windows.bed \ --OUTPUT variant_windows.interval_list # 2) Get reads from the input_cram for the intervals in variant_windows.interval_list time java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+DisableAttachMechanism -XX:MaxHeapSize=2000m -Xmx30000m \ -jar /gatk/GATK35.jar \ -T HaplotypeCaller \ -R {local_fasta} \ -I "{local_cram_path}" \ -L variant_windows.interval_list \ -XL {local_exclude_intervals} \ --disable_auto_index_creation_and_locking_when_reading_rods \ -ERC GVCF \ --max_alternate_alleles 3 \ -variant_index_parameter 128000 \ -variant_index_type LINEAR \ --read_filter OverclippedRead \ -bamout "{output_prefix}.bamout.bam" \ -o "{output_prefix}.gvcf" |& grep -v "^DEBUG" bgzip "{output_prefix}.gvcf" tabix "{output_prefix}.gvcf.gz" gsutil -m cp "{output_prefix}.bamout.bam" {args.output_dir} gsutil -m cp "{output_prefix}.bamout.bai" {args.output_dir} gsutil -m cp "{output_prefix}.gvcf.gz" {args.output_dir} gsutil -m cp "{output_prefix}.gvcf.gz.tbi" {args.output_dir} ls -lh echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo -------------- """)
def test_hadoop_is_file(self):
    self.assertTrue(hl.hadoop_is_file(resource('ls_test/f_50')))
    self.assertFalse(hl.hadoop_is_file(resource('ls_test/subdir')))
    self.assertFalse(hl.hadoop_is_file(resource('ls_test/invalid-path')))
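As the tests above exercise, hl.hadoop_is_file returns False both for directories and for paths that do not exist, so on its own it cannot tell a missing path apart from a directory. A short sketch of combining it with hl.hadoop_exists (a separate Hail filesystem helper) for that distinction; the path used is purely illustrative.

import hail as hl

path = 'gs://my-bucket/some/path'  # illustrative path, not taken from the tests above

if hl.hadoop_is_file(path):
    print('regular file')
elif hl.hadoop_exists(path):
    # exists but is not a regular file, e.g. a directory backing a Hail Table/MatrixTable
    print('directory (or other non-file object)')
else:
    print('does not exist')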
    os.system(command)

#grouped_gcnv_cluster_to_sample_bed_paths

#%%
for cluster_name, paths in sorted(
        grouped_gcnv_cluster_to_sample_bed_paths.items(),
        key=lambda t: len(t[1])):  #, reverse=True):
    print(f"Processing {cluster_name} which has {len(paths)} samples")

    #for i in range(len(paths)//250):
    #    cluster_df = None
    #    for path in tqdm.tqdm(paths[i*250:(i+1)*250], unit=" paths"):
    cluster_bed_bucket_path = f"gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v3/beds/{cluster_name}.bed.gz"
    if hl.hadoop_is_file(f"{cluster_bed_bucket_path}.tbi"):
        print(f"{cluster_bed_bucket_path} already exists. Skipping..")
        continue

    cluster_df = None
    paths.sort(key=lambda path: os.path.basename(path))
    for path in tqdm.tqdm(paths, unit=" paths"):
        sample_name = os.path.basename(path).replace("denoised_copy_ratios-", "")
        sample_name = re.sub(".tsv$", "", sample_name)
        column_name = sample_name
        if cluster_df is not None:
            while column_name in set(cluster_df.columns):
                print(f"WARNING: Duplicate sample name: {column_name} {path}")
                column_name += "_2"
def main(args): hl.init(default_reference=args.default_ref_genome) if args.run_test_mode: logger.info('Running pipeline on test data...') mt = (get_mt_data(part='raw_chr20').sample_rows(0.1)) else: logger.info( 'Running pipeline on MatrixTable wih adjusted genotypes...') ds = args.exome_cohort mt = hl.read_matrix_table( get_qc_mt_path(dataset=ds, part='unphase_adj_genotypes', split=True)) # 1. Sample-QC filtering if not args.skip_sample_qc_filtering: logger.info('Applying per sample QC filtering...') mt = apply_sample_qc_filtering(mt) logger.info( 'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...' ) mt = (mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt', overwrite=True)) # 2. Variant-QC filtering if not args.skip_variant_qc_filtering: logger.info('Applying per variant QC filtering...') if hl.hadoop_is_file( f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'): logger.info('Reading pre-existing sample qc-filtered MT...') mt = hl.read_matrix_table( f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt') mt = apply_variant_qc_filtering(mt) # write hard filtered MT to disk logger.info( 'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...' ) mt = (mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt', overwrite=True)) # 3. Annotate AFs # allelic frequency cut-off maf_cutoff = args.af_max_threshold if not args.skip_af_filtering: if hl.hadoop_is_file( f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'): logger.info( 'Reading pre-existing sample/variant qc-filtered MT...') mt = hl.read_matrix_table( f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt') # Annotate allelic frequencies from external source, # and compute internal AF on samples passing QC af_ht = get_af_annotation_ht() mt = (mt.annotate_rows(**af_ht[mt.row_key])) filter_expressions = [ af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff), af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff), af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff) ] mt = (mt.filter_rows(functools.reduce(operator.iand, filter_expressions), keep=True)) logger.info( 'Writing qc-filtered MT filtered to external maf with to disk...') mt = (mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt', overwrite=True)) # 4. ##### Burden Test ###### logger.info('Running burden test...') if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'): logger.info( 'Reading pre-existing sample/variant qc-filtered MT with rare variants...' 
) mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt') ## Add VEP-annotated fields vep_ht = get_vep_annotation_ht() mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF, Consequence=vep_ht[mt.row_key].vep.Consequence, DOMAINS=vep_ht[mt.row_key].vep.DOMAINS, SYMBOL=vep_ht[mt.row_key].vep.SYMBOL)) ## Filter to bi-allelic variants if args.filter_biallelic: logger.info('Running burden test on biallelic variants...') mt = mt.filter_rows(bi_allelic_expr(mt)) ## Filter to variants within protein domain(s) if args.filter_protein_domain: logger.info( 'Running burden test on variants within protein domain(s)...') mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS), keep=True) ## Add cases/controls sample annotations tb_sample = get_sample_meta_data() mt = (mt.annotate_cols(**tb_sample[mt.s])) mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control'])) ## Annotate pathogenic scores ht_scores = get_vep_scores_ht() mt = mt.annotate_rows(**ht_scores[mt.row_key]) ## Classify variant into (major) consequence groups score_expr_ann = { 'hcLOF': mt.LoF == 'HC', 'syn': mt.Consequence == 'synonymous_variant', 'miss': mt.Consequence == 'missense_variant' } # Update dict expr annotations with combinations of variant consequences categories score_expr_ann.update({ 'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD), (mt['vep.REVEL_score'] >= REVEL_THRESHOLD), (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2) & score_expr_ann.get('miss') }) score_expr_ann.update({ 'hcLOF_missC': score_expr_ann.get('hcLOF') | score_expr_ann.get('missC') }) mt = (mt.annotate_rows(csq_group=score_expr_ann)) # Transmute csq_group and convert dict to set where the group is defined # (easier to explode and grouping later) mt = (mt.transmute_rows(csq_group=hl.set( hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys())))) mt = (mt.filter_rows(hl.len(mt.csq_group) > 0)) # Explode nested csq_group before grouping mt = (mt.explode_rows(mt.csq_group)) # print('Number of samples/variants: ') # print(mt.count()) # Group mt by gene/csq_group. mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate( hets=hl.agg.any(mt.GT.is_het()), homs=hl.agg.any(mt.GT.is_hom_var()), chets=hl.agg.count_where(mt.GT.is_het()) >= 2, homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) | (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist()) mts = [] if args.homs: # select homs genotypes. mt_homs = (mt_grouped.select_entries( mac=mt_grouped.homs).annotate_rows(agg_genotype='homs')) mts.append(mt_homs) if args.chets: # select compound hets (chets) genotypes. mt_chets = (mt_grouped.select_entries( mac=mt_grouped.chets).annotate_rows(agg_genotype='chets')) mts.append(mt_chets) if args.homs_chets: # select chets and/or homs genotypes. 
mt_homs_chets = (mt_grouped.select_entries( mac=mt_grouped.homs_chets).annotate_rows( agg_genotype='homs_chets')) mts.append(mt_homs_chets) if args.hets: # select hets genotypes mt_hets = (mt_grouped.select_entries( mac=mt_grouped.hets).annotate_rows(agg_genotype='hets')) mts.append(mt_hets) ## Joint MatrixTables mt_grouped = hl.MatrixTable.union_rows(*mts) # Generate table of counts tb_gene = (mt_grouped.annotate_rows( n_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.sum(mt_grouped.mac)), n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'], hl.agg.sum(mt_grouped.mac)), n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'], hl.agg.sum(mt_grouped.mac)), n_controls=hl.agg.filter(mt_grouped['phe.is_control'], hl.agg.sum(mt_grouped.mac)), n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()), n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'], hl.agg.count()), n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'], hl.agg.count()), n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'], hl.agg.count())).rows()) # run fet stratified by proband type analysis = ['all_cases', 'syndromic', 'nonsyndromic'] tbs = [] for proband in analysis: logger.info(f'Running test for {proband}...') colCases = None colTotalCases = None colControls = 'n_controls' colTotalControls = 'n_total_controls' if proband == 'all_cases': colCases = 'n_cases' colTotalCases = 'n_total_cases' if proband == 'syndromic': colCases = 'n_syndromic' colTotalCases = 'n_total_syndromic' if proband == 'nonsyndromic': colCases = 'n_nonsyndromic' colTotalCases = 'n_total_nonsyndromic' tb_fet = compute_fisher_exact(tb=tb_gene, n_cases_col=colCases, n_control_col=colControls, total_cases_col=colTotalCases, total_controls_col=colTotalControls, correct_total_counts=True, root_col_name='fet', extra_fields={ 'analysis': proband, 'maf': maf_cutoff }) # filter out zero-count genes tb_fet = (tb_fet.filter( hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True)) tbs.append(tb_fet) tb_final = hl.Table.union(*tbs) tb_final.describe() # export results date = current_date() run_hash = str(uuid.uuid4())[:6] output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht' tb_final = (tb_final.checkpoint(output=output_path)) if args.write_to_file: # write table to disk as TSV file (tb_final.export(f'{output_path}.tsv')) hl.stop()
def main(): rnaseq_sample_metadata_df = get_joined_metadata_df() #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df() p = batch_utils.init_arg_parser( default_cpu=4, gsa_key_file=os.path.expanduser( "~/.config/gcloud/misc-270914-cb9992ec9b25.json")) grp = p.add_mutually_exclusive_group(required=True) grp.add_argument( "-b", "--rnaseq-batch-name", nargs="*", help="RNA-seq batch names to process (eg. -b batch1 batch2)", choices=set(rnaseq_sample_metadata_df['star_pipeline_batch']) | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"])) grp.add_argument( "-s", "--rnaseq-sample-id", nargs="*", help="RNA-seq sample IDs to process (eg. -s sample1 sample2)", choices=set(rnaseq_sample_metadata_df['sample_id']) | set([ 'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E', 'GTEX-1KXAM-0005-SM-DIPEC' ])) args = p.parse_args() # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir, batch_name, sex, RIN, ancestry, etc. samples_df = pd.DataFrame() if args.rnaseq_batch_name: for batch_name in args.rnaseq_batch_name: df = rnaseq_sample_metadata_df[ rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name] samples_df = transfer_metadata_columns_from_df(samples_df, df) elif args.rnaseq_sample_id: df = rnaseq_sample_metadata_df[ rnaseq_sample_metadata_df.sample_id.isin(set( args.rnaseq_sample_id))] samples_df = transfer_metadata_columns_from_df(samples_df, df) else: p.error("Must specify -b or -s") logger.info( f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}" ) # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details with batch_utils.run_batch(args) as batch: for sample_id in samples_df.sample_id: metadata_row = samples_df.loc[sample_id] # set job inputs & outputs input_bam, input_bai = metadata_row['bam_path'], metadata_row[ 'bai_path'] output_dir = metadata_row['output_dir'] print("Input bam: ", input_bam) output_filename = f"{sample_id}.bigWig" output_file_path = os.path.join(output_dir, output_filename) # check if output file already exists if hl.hadoop_is_file(output_file_path) and not args.force: logger.info( f"{sample_id} output file already exists: {output_file_path}. Skipping..." ) continue file_stats = hl.hadoop_stat(metadata_row['bam_path']) bam_size = int(round(file_stats['size_bytes'] / 10.**9)) disk_size = bam_size * 2 j = batch_utils.init_job(batch, f"bam=>bigWig: {sample_id}", cpu=args.cpu, memory=args.memory, disk_size=disk_size, image=DOCKER_IMAGE) batch_utils.switch_gcloud_auth_to_user_account( j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam" ) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai" ) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes ." ) j.command(f"touch {sample_id}.bam.bai") j.command(f"pwd && ls && date") j.command( f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}" ) j.command(f"cp {output_filename} {j.output_bigWig}") j.command(f"echo Done: {output_file_path}") j.command(f"date") # copy output batch.write_output(j.output_bigWig, output_file_path) print("Output file path: ", output_file_path)
def main(): p = batch_utils.init_arg_parser( default_cpu=4, gsa_key_file=os.path.expanduser( "~/.config/gcloud/misc-270914-cb9992ec9b25.json")) p.add_argument( "--metadata-tsv-path", default=ALL_METADATA_TSV, help="Table with columns: sample_id, bam_path, bai_path, batch") p.add_argument("--counts-tsv-path", default=ALL_COUNTS_TSV_GZ, help="Counts .tsv") g = p.add_mutually_exclusive_group() g.add_argument("--with-gtex", help="Use GTEX controls.", action="store_true") g.add_argument( "--only-gtex", help="Run on just the GTEX control samples to test FP rate.", action="store_true") p.add_argument("batch_name", nargs="+", choices=ANALYSIS_BATCHES.keys(), help="Name of RNA-seq batch to process") args = p.parse_args() if not args.force: hl.init(log="/dev/null", quiet=True) # process samples batch_label = f"OUTRIDER" if args.with_gtex: batch_label += " (with GTEx)" batch_label += ": " batch_label += ','.join(args.batch_name) with batch_utils.run_batch(args, batch_label) as batch: for batch_name in args.batch_name: batch_dict = ANALYSIS_BATCHES[batch_name] batch_tissue = batch_dict['tissue'] batch_sex = batch_dict['sex'] c_vector_of_sample_names = 'c("' + '", "'.join( batch_dict['samples']) + '")' if args.with_gtex: batch_include_GTEX_samples = "TRUE" batch_name += "_with_GTEX" elif args.only_gtex: c_vector_of_sample_names = "c()" batch_include_GTEX_samples = "TRUE" batch_name += "_only_GTEX" else: batch_include_GTEX_samples = "FALSE" batch_name += "_without_GTEX" j = batch_utils.init_job(batch, batch_name, DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory, disk_size=10) batch_utils.switch_gcloud_auth_to_user_account( j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT) # copy inputs j.command(f"""gsutil -m cp {GENCODE_TXDB} .""") j.command( f"""gsutil -m cp {args.metadata_tsv_path} {args.counts_tsv_path} .""" ) output_file = os.path.join(OUTPUT_BASE_DIR, f"{batch_name}.RDS") if not args.force and hl.hadoop_is_file(output_file): logger.info( f"Output file exists: {output_file} . Skipping {batch_name}..." 
                )
                continue

            j.command(f"""time xvfb-run Rscript -e '
# outrider
library(OUTRIDER)
library(annotables)
library(data.table)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(purrr)
library(ggrepel)
library(plotly)
library(stringr)
library(RColorBrewer)
library(ggsci)
library(gtable)
library(grid)
library(gridExtra)

possibleConfounders = c("tissue", "sex", "stranded", "read_length", "batch")  # "RIN"

# input tables generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/export_gagneur_metadata_table.py
# batches generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/metadata_notebook.py
sampleInfo = fread("{os.path.basename(args.metadata_tsv_path)}")
sampleInfo$read_length = as.character(sampleInfo$read_length)

GTEX_sampleIds = c()
if ({batch_include_GTEX_samples}) {{
    if (("{batch_sex}" == "M") || ("{batch_sex}" == "F")) {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$sex == "{batch_sex}") & (sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id
    }} else {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id
    }}
}}

sampleLabel = "{batch_name}_"
sampleSubset = {c_vector_of_sample_names}
sampleSubset = c(sampleSubset, GTEX_sampleIds)
print("sampleSubset: ")
print(sampleSubset)

sampleInfo = sampleInfo[sampleInfo$sample_id %in% sampleSubset]
if (nrow(sampleInfo) != length(sampleSubset)) {{
    print(paste("ERROR: nrow(sampleInfo) != length(sampleSubset):", nrow(sampleInfo), length(sampleSubset)))
    quit("yes")
}}

geneReadCounts = fread("{os.path.basename(args.counts_tsv_path)}", select=c("gene_id", sampleSubset))
geneReadCounts = geneReadCounts[!grepl("ERCC", geneReadCounts$gene_id),]

geneIds = geneReadCounts$gene_id
colsMinusGeneId = colnames(geneReadCounts)[!colnames(geneReadCounts) %in% c("gene_id")]
geneReadCounts = geneReadCounts[,..colsMinusGeneId]
rownames(geneReadCounts) = geneIds

cnts = as.matrix(geneReadCounts)
rownames(cnts) = geneIds
ncol(cnts)
nrow(cnts)
if (ncol(cnts) != length(sampleSubset)) {{
    print(paste("ERROR: ncol(cnts) != length(sampleSubset):", ncol(cnts), length(sampleSubset)))
    quit("yes")
}}

sampleInfo[,sampleID:=sample_id]
ods <- OutriderDataSet(countData=cnts, colData=sampleInfo)

txdb <- loadDb("{os.path.basename(GENCODE_TXDB)}")
ods <- filterExpression(ods, gtfFile=txdb, filterGenes=FALSE)  #, fpkmCutoff=100)

g = plotFPKM(ods) + theme_bw() + theme(legend.position="bottom")
ggsave(file=paste(sampleLabel, "_plotFPKM.png", sep=""), g, device="png", type="cairo")
#plotExpressedGenes(ods)

ods <- estimateSizeFactors(ods)
sortedSizeFactors = sort(sizeFactors(ods))
g = ggplot(data=NULL, aes(y=sortedSizeFactors, x=1:ncol(ods))) +
    geom_point(color="blue", size=1) +
    labs(x="Sample rank", y="Size factors", title="Size factor distribution") +
    geom_label_repel(aes(label=ifelse(sortedSizeFactors > 1.5, names(sortedSizeFactors), "")), nudge_x = -35, box.padding = 0.35, point.padding = 0.5, segment.color = "grey50") +
    geom_label_repel(aes(label=ifelse(sortedSizeFactors < 0.5, names(sortedSizeFactors), "")), nudge_x = 35, box.padding = 0.35, point.padding = 0.5, segment.color = "grey50") +
    theme_bw()
ggsave(file=paste(sampleLabel, "_sizeFactors.png", sep=""), g, type="cairo")
print(sort(sizeFactors(ods))[1:5])

print(paste(length(ods), "genes before filtering"))
ods <- ods[mcols(ods)$passedFilter,]
print(paste(length(ods), "genes after filtering"))

plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1,
    filename=paste(sampleLabel, "_plotCountCorHeatmap_before_correction.pdf", sep=""))
plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo",
    filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_before_correction.pdf", sep=""))

if (length(sampleSubset) > 5) {{
    ods = findEncodingDim(ods, BPPARAM=MulticoreParam(4, progressbar=TRUE))
    g = plotEncDimSearch(ods)
    ggsave(file=paste(sampleLabel, "_plotEncDimSearch", ".png", sep=""), g, type="cairo")
    optimal_q = metadata(ods)$opt
}} else {{
    optimal_q = length(sampleSubset)
}}

# increase / decrease by 25%
q = optimal_q

original_ods = ods
ods = OUTRIDER(original_ods, verbose=TRUE, iterations=15, q=q, BPPARAM=MulticoreParam(4, progressbar=TRUE))
saveRDS(ods, paste(sampleLabel, "_ods.RDS", sep=""))

plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1,
    main=paste("Count correlation heatmap q=", q, sep=""),
    filename=paste(sampleLabel, "_plotCountCorHeatmap_after_correction.pdf", sep=""))
plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo",
    main=paste("Count Gene vs Sample Heatmap q=", q, sep=""),
    filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_after_correction.pdf", sep=""))

res = results(ods, padjCutoff=1)
res = res[,c("sampleID", "geneID", "pValue", "padjust", "zScore", "rawcounts")][order(padjust),]
res[, "q"] = q

write.table(res, file=paste(sampleLabel, "_ods__", "q", q, "_results.tsv", sep=""), quote=FALSE, sep="\\t", row.names=FALSE)
'""")

            j.command("gzip *.tsv")
            j.command(f"gsutil -m cp *.tsv.gz *.pdf *.png *.RDS {OUTPUT_BASE_DIR}")

            logger.info(f"Output: {output_file}")
def main(): p = batch_utils.init_arg_parser( default_cpu=4, gsa_key_file=os.path.expanduser( "~/.config/gcloud/misc-270914-cb9992ec9b25.json")) p.add_argument("--with-gtex", help="Use GTEX controls.", action="store_true") p.add_argument("--skip-step1", action="store_true", help="Skip count-split-reads step") p.add_argument("--skip-step2", action="store_true", help="Skip compute-PSI step") p.add_argument("--skip-step3", action="store_true", help="Skip compute-best-Q step") p.add_argument("-m1", "--memory-step1", type=float, help="Batch: (optional) memory in gigabytes (eg. 3.75)", default=3.75) p.add_argument("-m2", "--memory-step2", type=float, help="Batch: (optional) memory in gigabytes (eg. 3.75)", default=3.75) p.add_argument( "--metadata-tsv-path", default=ALL_METADATA_TSV, help="Table with columns: sample_id, bam_path, bai_path, batch") p.add_argument("batch_name", nargs="+", choices=ANALYSIS_BATCHES.keys(), help="Name of RNA-seq batch to process") args = p.parse_args() hl.init(log="/dev/null", quiet=True) with hl.hadoop_open(args.metadata_tsv_path) as f: samples_df_unmodified = pd.read_table(f).set_index("sample_id", drop=False) batch_label = f"FRASER" if args.with_gtex: batch_label += " (with GTEx)" batch_label += ": " batch_label += ','.join(args.batch_name) with batch_utils.run_batch(args, batch_label) as batch: for batch_name in args.batch_name: samples_df = samples_df_unmodified batch_dict = ANALYSIS_BATCHES[batch_name] batch_tissue = batch_dict['tissue'] batch_sex = batch_dict['sex'] sample_ids = list(batch_dict['samples']) if args.with_gtex: batch_name += "_with_GTEX" samples_df_filter = (samples_df.tissue == batch_tissue) samples_df_filter &= samples_df.sample_id.str.startswith( "GTEX") if batch_sex == "M" or batch_sex == "F": samples_df_filter &= (samples_df.sex == batch_sex) sample_ids += list(samples_df[samples_df_filter].sample_id) else: batch_name += "_without_GTEX" samples_df = samples_df.loc[sample_ids] byte_string = ", ".join(sorted(samples_df.sample_id)).encode() h = hashlib.md5(byte_string).hexdigest().upper() sample_set_label = f"{batch_name}__{len(samples_df.sample_id)}_samples_{h[:10]}" logger.info( f"Processing {sample_set_label}: {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}" ) split_reads_samples = [] split_reads_output_files = [] split_reads_jobs = {} non_split_reads_output_files = [] non_split_reads_jobs = {} j_extract_splice_junctions = None j_calculate_psi_values = None j_calculate_best_q = None # based on docs @ https://bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf # step 1: count spliced reads # step 2: count non-spliced reads at acceptors & donors of splice junctions detected in step 1 for step in 1, 2: for sample_id in samples_df.sample_id: metadata_row = samples_df.loc[sample_id] # set job inputs & outputs input_bam, input_bai = metadata_row[ 'bam_path'], metadata_row['bai_path'] if "GTEX" in sample_id: output_dir_for_sample_specific_data = "gs://macarthurlab-rnaseq/gtex_v8/fraser_count_rna/" else: output_dir_for_sample_specific_data = f"gs://macarthurlab-rnaseq/{metadata_row['batch']}/fraser_count_rna/" output_dir_for_batch_specific_data = f"gs://macarthurlab-rnaseq/gagneur/fraser/results/{sample_set_label}" output_file_path_splice_junctions_RDS = os.path.join( output_dir_for_batch_specific_data, f"spliceJunctions_{sample_set_label}.RDS") output_file_path_calculated_psi_values_tar_gz = os.path.join( output_dir_for_batch_specific_data, f"calculatedPSIValues_{sample_set_label}.tar.gz") 
output_file_path_calculated_best_q_tar_gz = os.path.join( output_dir_for_batch_specific_data, f"calculatedBestQ_{sample_set_label}.tar.gz") output_file_path_fraser_analysis_tar_gz = os.path.join( output_dir_for_batch_specific_data, f"fraserAnalysis_using_PCA_{sample_set_label}.tar.gz") output_file_path_fraser_analysis_results_only_tar_gz = os.path.join( output_dir_for_batch_specific_data, f"fraserAnalysis_using_PCA_{sample_set_label}_results_only.tar.gz" ) print("Input bam: ", input_bam) if step == 1: output_file_path = os.path.join( output_dir_for_sample_specific_data, f"fraser_count_split_reads_{sample_id}.tar.gz") memory = args.memory_step1 elif step == 2: output_file_path = os.path.join( output_dir_for_batch_specific_data, f"fraser_count_non_split_reads_{sample_id}__{sample_set_label}.tar.gz" ) memory = args.memory_step2 if step == 1: split_reads_samples.append(sample_id) split_reads_output_files.append(output_file_path) elif step == 2: non_split_reads_output_files.append(output_file_path) if (step == 1 and args.skip_step1) or (step == 2 and args.skip_step2): continue # check if output file already exists if not args.force and hl.hadoop_is_file(output_file_path): logger.info( f"{sample_id} output file already exists: {output_file_path}. Skipping..." ) continue if not args.local: file_stats = hl.hadoop_stat(metadata_row['bam_path']) bam_size = int(round(file_stats['size_bytes'] / 10.**9)) disk_size = bam_size * 2 else: disk_size = None job_label = f"Count {'split' if step == 1 else 'non-split'} reads" j = batch_utils.init_job(batch, f"{job_label}: {sample_id}", cpu=args.cpu, memory=memory, disk_size=disk_size, image=DOCKER_IMAGE) batch_utils.switch_gcloud_auth_to_user_account( j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam" ) j.command( f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai" ) j.command(f"touch {sample_id}.bam.bai") bam_path = f"{sample_id}.bam" j.command(f"pwd && ls -lh && date") if step == 1: # count split reads j.command(f"""time xvfb-run Rscript -e ' library(FRASER) library(data.table) sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}")) print(sampleTable) fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L) getSplitReadCountsForAllSamples(fds) # saves results to cache/ '""") elif step == 2: if sample_id in split_reads_jobs: j.depends_on(split_reads_jobs[sample_id]) if j_extract_splice_junctions: j.depends_on(j_extract_splice_junctions) j.command( f"gsutil -m cp {output_file_path_splice_junctions_RDS} ." 
) # count non-split reads j.command(f"""time xvfb-run Rscript -e ' library(FRASER) library(data.table) spliceJunctions = readRDS("{os.path.basename(output_file_path_splice_junctions_RDS)}") sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}")) print(sampleTable) fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L) getNonSplitReadCountsForAllSamples(fds, spliceJunctions) # saves results to cache/ '""") j.command(f"ls -lh .") j.command( f"tar czf {os.path.basename(output_file_path)} cache") j.command( f"gsutil -m cp {os.path.basename(output_file_path)} {output_file_path}" ) j.command(f"echo Done: {output_file_path}") j.command(f"date") print("Output file path: ", output_file_path) if step == 1: split_reads_jobs[sample_id] = j elif step == 2: non_split_reads_jobs[sample_id] = j if len(split_reads_output_files) == 0: break if step == 1 and not args.skip_step1: if hl.hadoop_is_file(output_file_path_splice_junctions_RDS ) and not args.force: logger.info( f"{output_file_path_splice_junctions_RDS} file already exists. Skipping extractSpliceJunctions step..." ) continue j_extract_splice_junctions = batch_utils.init_job( batch, f"{sample_set_label}: Extract splice-junctions", disk_size=30, memory=60, image=DOCKER_IMAGE) for j in split_reads_jobs.values(): j_extract_splice_junctions.depends_on(j) extract_splice_junctions( j_extract_splice_junctions, split_reads_output_files, args.cpu, output_file_path_splice_junctions_RDS) elif step == 2 and not args.skip_step2: if hl.hadoop_is_file( output_file_path_calculated_psi_values_tar_gz ) and not args.force: logger.info( f"{output_file_path_calculated_psi_values_tar_gz} file already exists. Skipping calculatePSIValues step..." ) continue num_cpu = 4 if args.local else 16 memory = 60 j_calculate_psi_values = batch_utils.init_job( batch, f"{sample_set_label}: Calculate PSI values", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE) if j_extract_splice_junctions: j_calculate_psi_values.depends_on( j_extract_splice_junctions) for j in non_split_reads_jobs.values(): j_calculate_psi_values.depends_on(j) calculate_psi_values( j_calculate_psi_values, sample_set_label, split_reads_output_files, non_split_reads_output_files, output_file_path_splice_junctions_RDS, args.metadata_tsv_path, num_cpu, output_file_path_calculated_psi_values_tar_gz) # compute Best Q if args.skip_step3: logger.info(f"Skipping calculatedBestQ step...") elif hl.hadoop_is_file(output_file_path_calculated_best_q_tar_gz ) and not args.force: logger.info( f"{output_file_path_calculated_best_q_tar_gz} file already exists. Skipping calculatedBestQ step..." ) else: num_cpu = 4 if args.local else 16 memory = 3.75 * num_cpu j_calculate_best_q = batch_utils.init_job( batch, f"{sample_set_label}: Calculate Best Q", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE) if j_calculate_psi_values: j_calculate_best_q.depends_on(j_calculate_psi_values) calculate_best_q( j_calculate_best_q, sample_set_label, 4, output_file_path_calculated_psi_values_tar_gz, output_file_path_calculated_best_q_tar_gz) # output_file_path_fraser_analysis_tar_gz if hl.hadoop_is_file( output_file_path_fraser_analysis_results_only_tar_gz ) and not args.force: logger.info( f"{output_file_path_fraser_analysis_results_only_tar_gz} file already exists. Skipping run_fraser_analysis step..." 
) else: num_cpu = 4 if args.local else 16 memory = 3.75 * num_cpu j_fraser_analysis = batch_utils.init_job( batch, f"{sample_set_label}: Run Fraser Analysis", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE) if j_calculate_best_q: j_fraser_analysis.depends_on(j_calculate_best_q) run_fraser_analysis( j_fraser_analysis, sample_set_label, 4, output_file_path_calculated_best_q_tar_gz, output_file_path_fraser_analysis_tar_gz, output_file_path_fraser_analysis_results_only_tar_gz)
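# extract_splice_junctions(), calculate_psi_values(), calculate_best_q() and run_fraser_analysis() are
# helpers defined elsewhere. As an illustration of how one of them is presumably wired, here is a
# rough sketch of calculate_best_q(): the argument order is taken from the call site above, while the
# tarball layout (savedObjects/ + cache/ written by saveFraserDataSet), the dataset name, and the
# specific FRASER calls are assumptions, not the original implementation.
import os


def calculate_best_q(j, sample_set_label, num_threads, input_tar_gz_path, output_tar_gz_path):
    """Unpack the PSI-value FraserDataSet, run FRASER's encoding-dimension (q) search, re-upload."""
    j.command(f"gsutil -m cp {input_tar_gz_path} .")
    j.command(f"tar xzf {os.path.basename(input_tar_gz_path)}")
    j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(BiocParallel)
register(MulticoreParam({num_threads}, progressbar=TRUE))

# assumption: the tarball contains the working dir written by saveFraserDataSet(fds, dir=".", name=...)
fds = loadFraserDataSet(dir=".", name="{sample_set_label}")
for (psiType in c("psi5", "psi3", "psiSite")) {{
    fds = optimHyperParams(fds, type=psiType, plot=FALSE)
    print(paste(psiType, "best q:", bestQ(fds, type=psiType)))
}}
saveFraserDataSet(fds, dir=".", name="{sample_set_label}")
'""")
    j.command(f"tar czf {os.path.basename(output_tar_gz_path)} savedObjects cache")
    j.command(f"gsutil -m cp {os.path.basename(output_tar_gz_path)} {output_tar_gz_path}")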