def test_hadoop_stat(self):
    path1 = resource('ls_test')
    stat1 = hl.hadoop_stat(path1)
    self.assertEqual(stat1['is_dir'], True)

    path2 = resource('ls_test/f_50')
    stat2 = hl.hadoop_stat(path2)
    self.assertEqual(stat2['size_bytes'], 50)
    self.assertEqual(stat2['is_dir'], False)
    self.assertTrue('path' in stat2)

def test_hadoop_stat(self):
    stat1 = hl.hadoop_stat(f'{BUCKET}/')
    self.assertEqual(stat1['is_dir'], True)

    stat2 = hl.hadoop_stat(f'{BUCKET}/test_out.copy.txt.gz')
    self.assertEqual(stat2['size_bytes'], 302)
    self.assertEqual(stat2['is_dir'], False)
    self.assertTrue('path' in stat2)
    self.assertTrue('owner' in stat2)
    self.assertTrue('modification_time' in stat2)

def test_hadoop_stat(self):
    path1 = resource('ls_test')
    stat1 = hl.hadoop_stat(path1)
    self.assertEqual(stat1['is_dir'], True)

    path2 = resource('ls_test/f_50')
    stat2 = hl.hadoop_stat(path2)
    self.assertEqual(stat2['size_bytes'], 50)
    self.assertEqual(stat2['is_dir'], False)
    self.assertTrue('path' in stat2)
    self.assertTrue('owner' in stat2)
    self.assertTrue('modification_time' in stat2)

def test_hadoop_stat(self, bucket=None):
    if bucket is None:
        bucket = self.remote_bucket

    stat1 = hl.hadoop_stat(f'{bucket}/')
    self.assertEqual(stat1['is_dir'], True)

    stat2 = hl.hadoop_stat(f'{bucket}/test_out.copy.txt.gz')
    self.assertEqual(stat2['size_bytes'], 302)
    self.assertEqual(stat2['is_dir'], False)
    self.assertTrue('path' in stat2)
    self.assertTrue('owner' in stat2)
    self.assertTrue('modification_time' in stat2)

def test_hadoop_stat(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir

    data = ['foo', 'bar', 'baz']
    data.extend(map(str, range(100)))

    with hadoop_open(f'{prefix}/test_hadoop_stat.txt.gz', 'w') as f:
        for d in data:
            f.write(d)
            f.write('\n')

    stat1 = hl.hadoop_stat(f'{prefix}')
    self.assertEqual(stat1['is_dir'], True)

    hadoop_copy(f'{prefix}/test_hadoop_stat.txt.gz',
                f'{prefix}/test_hadoop_stat.copy.txt.gz')

    stat2 = hl.hadoop_stat(f'{prefix}/test_hadoop_stat.copy.txt.gz')
    # The gzip format permits metadata which makes the compressed file's size unpredictable. In
    # practice, Hadoop creates a 175 byte file and gzip.GzipFile creates a 202 byte file. The 27
    # extra bytes appear to include at least the filename (20 bytes) and a modification timestamp.
    assert stat2['size_bytes'] == 175 or stat2['size_bytes'] == 202
    self.assertEqual(stat2['is_dir'], False)
    self.assertTrue('path' in stat2)

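# The assertion above hinges on gzip's variable-size header. A standalone sketch (independent of
# Hadoop and of this test) of why the same payload can compress to different sizes: gzip.GzipFile
# embeds a filename and modification time in the header, while a bare gzip stream need not.
# The payload below only approximates the test data and is purely illustrative.
import gzip
import io

payload = b"foo\nbar\nbaz\n" + b"".join(f"{i}\n".encode() for i in range(100))

minimal = gzip.compress(payload, mtime=0)  # minimal header: no filename, zeroed mtime
buf = io.BytesIO()
with gzip.GzipFile(filename="test_hadoop_stat.txt", mode="wb", fileobj=buf) as f:
    f.write(payload)  # header additionally carries the filename
with_metadata = buf.getvalue()

print(len(minimal), len(with_metadata))  # the second is larger by the extra header bytes
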
def test_hadoop_copy_log(self):
    with with_local_temp_file('log') as r:
        hl.copy_log(r)
        stats = hl.hadoop_stat(r)
        self.assertTrue(stats['size_bytes'] > 0)

def test_hadoop_copy_log(self):
    r = resource('copy_log_test.txt')
    hl.copy_log(r)
    stats = hl.hadoop_stat(r)
    self.assertTrue(stats['size_bytes'] > 0)

def modified_time(self, path):  # pylint: disable=no-self-use
    stat = hl.hadoop_stat(path)
    return datetime.datetime.strptime(stat["modification_time"], "%a %b %d %H:%M:%S %Z %Y")

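# Taken together, the tests and helper above exercise the fields returned by hl.hadoop_stat:
# 'path', 'size_bytes', 'is_dir', 'owner', and 'modification_time'. A minimal usage sketch
# (the gs:// path below is a placeholder, not a real resource):
import datetime

import hail as hl

stat = hl.hadoop_stat("gs://my-bucket/some-file.txt")
print(stat["path"], stat["size_bytes"], stat["is_dir"], stat["owner"])

# 'modification_time' is a formatted string; modified_time() above parses it with this format.
mtime = datetime.datetime.strptime(stat["modification_time"], "%a %b %d %H:%M:%S %Z %Y")
print(mtime.isoformat())
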
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b", "--rnaseq-batch-name", nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch']) | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s", "--rnaseq-sample-id", nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E', 'GTEX-1KXAM-0005-SM-DIPEC']))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir,
    # batch_name, sex, RIN, ancestry, etc.
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)
    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df.sample_id.isin(set(args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            input_bam, input_bai = metadata_row['bam_path'], metadata_row['bai_path']
            output_dir = metadata_row['output_dir']

            print("Input bam: ", input_bam)
            output_filename = f"{sample_id}.bigWig"
            output_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_file_path) and not args.force:
                logger.info(f"{sample_id} output file already exists: {output_file_path}. Skipping...")
                continue

            file_stats = hl.hadoop_stat(metadata_row['bam_path'])
            bam_size = int(round(file_stats['size_bytes'] / 10.**9))
            disk_size = bam_size * 2

            j = batch_utils.init_job(batch, f"bam=>bigWig: {sample_id}", cpu=args.cpu, memory=args.memory, disk_size=disk_size, image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam")
            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai")
            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes .")
            j.command(f"touch {sample_id}.bam.bai")

            j.command(f"pwd && ls && date")
            j.command(f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}")
            j.command(f"cp {output_filename} {j.output_bigWig}")
            j.command(f"echo Done: {output_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bigWig, output_file_path)

            print("Output file path: ", output_file_path)

def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()

    p = argparse.ArgumentParser()
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--local", action="store_true", help="Batch: run locally")
    grp.add_argument("--cluster", action="store_true", help="Batch: submit to cluster")
    p.add_argument("--batch-billing-project", default="tgg-rare-disease",
                   help="Batch: billing project. Required if submitting to cluster.")
    p.add_argument("--batch-job-name", help="Batch: (optional) job name")
    p.add_argument("-f", "--force", action="store_true",
                   help="Recompute and overwrite cached or previously computed data")
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b", "--rnaseq-batch-name", nargs="*", help="RNA-seq batch names to process",
                     choices=set(rnaseq_sample_metadata_df['star_pipeline_batch']))
    grp.add_argument("-s", "--rnaseq-sample-id", nargs="*", help="RNA-seq sample IDs to process",
                     choices=set(rnaseq_sample_metadata_df['sample_id']))
    args = p.parse_args()

    #logger.info("\n".join(df.columns))

    if args.rnaseq_batch_name:
        batch_names = args.rnaseq_batch_name
        sample_ids = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df['star_pipeline_batch'].isin(batch_names)].sample_id
    elif args.rnaseq_sample_id:
        sample_ids = args.rnaseq_sample_id

    logger.info(f"Processing {len(sample_ids)} sample ids: {', '.join(sample_ids)}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    if args.local:
        backend = hb.LocalBackend(gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    else:
        backend = hb.ServiceBackend(args.batch_billing_project)

    b = hb.Batch(backend=backend, name=args.batch_job_name)

    # define workflow inputs
    if args.local:
        genes_gtf = b.read_input("gencode.v26.annotation.gff3", extension=".gff3")
    else:
        genes_gtf = b.read_input("gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.GRCh38.gff3", extension=".gff3")

    # define parallel execution for samples
    for sample_id in sample_ids:
        metadata_row = rnaseq_sample_metadata_df.loc[sample_id]
        batch_name = metadata_row['star_pipeline_batch']

        # set job inputs & outputs
        input_read_data = b.read_input_group(
            bam=metadata_row['star_bam'],
            bai=metadata_row['star_bai'],
        )

        output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/majiq_build/"
        output_file_path = os.path.join(output_dir, f"majiq_build_{sample_id}.tar.gz")

        # check if output file already exists
        if hl.hadoop_is_file(output_file_path) and not args.force:
            logger.info(f"{sample_id} output file already exists: {output_file_path}. Skipping...")
            continue

        file_stats = hl.hadoop_stat(metadata_row['star_bam'])
        bam_size = int(round(file_stats['size_bytes'] / 10.**9))

        # define majiq build commands for this sample
        j = b.new_job(name=args.batch_job_name)
        j.image("weisburd/majiq:latest")
        j.storage(f'{bam_size*3}Gi')
        j.cpu(1)  # default: 1
        j.memory("15G")  # default: 3.75G
        logger.info(f'Requesting: {j._storage or "default"} storage, {j._cpu or "default"} CPU, {j._memory or "default"} memory')

        # switch to user account
        j.command(f"gcloud auth activate-service-account --key-file /gsa-key/key.json")
        j.command(f"gsutil -m cp -r {GCLOUD_CREDENTIALS_LOCATION}/.config /tmp/")
        j.command(f"rm -rf ~/.config")
        j.command(f"mv /tmp/.config ~/")
        j.command(f"gcloud config set account {GCLOUD_USER_ACCOUNT}")
        j.command(f"gcloud config set project {GCLOUD_PROJECT}")

        # run majiq build
        #j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/GENCODE/gencode.v26.GRCh38.ERCC.genes.collapsed_only.gtf .")
        j.command(f"mv {genes_gtf} gencode.gff3")
        j.command(f"mv {input_read_data.bam} {sample_id}.bam")
        j.command(f"mv {input_read_data.bai} {sample_id}.bam.bai")

        j.command(f"echo '[info]' >> majiq_build.cfg")
        j.command(f"echo 'readlen={metadata_row['read length (rnaseqc)']}' >> majiq_build.cfg")
        j.command(f"echo 'bamdirs=.' >> majiq_build.cfg")
        j.command(f"echo 'genome=hg38' >> majiq_build.cfg")
        j.command(f"echo 'strandness={'None' if metadata_row['stranded? (rnaseqc)'] == 'no' else 'reverse'}' >> majiq_build.cfg")
        j.command(f"echo '[experiments]' >> majiq_build.cfg")
        j.command(f"echo '{sample_id}={sample_id}' >> majiq_build.cfg")
        j.command(f"cat majiq_build.cfg >> {j.logfile}")

        j.command(f"majiq build gencode.gff3 -c majiq_build.cfg -j 1 -o majiq_build_{sample_id} >> {j.logfile}")
        j.command(f"tar czf majiq_build_{sample_id}.tar.gz majiq_build_{sample_id}")
        j.command(f"cp majiq_build_{sample_id}.tar.gz {j.output_tar_gz}")
        #j.command(f"ls -lh . >> {j.logfile}")
        #j.command(f"echo ls majiq_build_{sample_id} >> {j.logfile}")
        #j.command(f"ls -1 majiq_build_{sample_id} >> {j.logfile}")
        j.command(f"echo --- done {output_file_path} >> {j.logfile}")

        # copy output
        b.write_output(j.output_tar_gz, output_file_path)
        b.write_output(j.logfile, os.path.join(output_dir, f"majiq_build_{sample_id}.log"))

    b.run()

    if isinstance(backend, hb.ServiceBackend):
        backend.close()

def main():
    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("--with-gtex", help="Use GTEX controls.", action="store_true")
    p.add_argument("--skip-step1", action="store_true", help="Skip count-split-reads step")
    p.add_argument("--skip-step2", action="store_true", help="Skip compute-PSI step")
    p.add_argument("--skip-step3", action="store_true", help="Skip compute-best-Q step")
    p.add_argument("-m1", "--memory-step1", type=float, help="Batch: (optional) memory in gigabytes (eg. 3.75)", default=3.75)
    p.add_argument("-m2", "--memory-step2", type=float, help="Batch: (optional) memory in gigabytes (eg. 3.75)", default=3.75)
    p.add_argument("--metadata-tsv-path", default=ALL_METADATA_TSV,
                   help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("batch_name", nargs="+", choices=ANALYSIS_BATCHES.keys(), help="Name of RNA-seq batch to process")
    args = p.parse_args()

    hl.init(log="/dev/null", quiet=True)
    with hl.hadoop_open(args.metadata_tsv_path) as f:
        samples_df_unmodified = pd.read_table(f).set_index("sample_id", drop=False)

    batch_label = f"FRASER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:
        for batch_name in args.batch_name:
            samples_df = samples_df_unmodified
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            sample_ids = list(batch_dict['samples'])
            if args.with_gtex:
                batch_name += "_with_GTEX"
                samples_df_filter = (samples_df.tissue == batch_tissue)
                samples_df_filter &= samples_df.sample_id.str.startswith("GTEX")
                if batch_sex == "M" or batch_sex == "F":
                    samples_df_filter &= (samples_df.sex == batch_sex)
                sample_ids += list(samples_df[samples_df_filter].sample_id)
            else:
                batch_name += "_without_GTEX"

            samples_df = samples_df.loc[sample_ids]
            byte_string = ", ".join(sorted(samples_df.sample_id)).encode()
            h = hashlib.md5(byte_string).hexdigest().upper()
            sample_set_label = f"{batch_name}__{len(samples_df.sample_id)}_samples_{h[:10]}"

            logger.info(f"Processing {sample_set_label}: {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}")

            split_reads_samples = []
            split_reads_output_files = []
            split_reads_jobs = {}

            non_split_reads_output_files = []
            non_split_reads_jobs = {}

            j_extract_splice_junctions = None
            j_calculate_psi_values = None
            j_calculate_best_q = None

            # based on docs @ https://bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf
            # step 1: count spliced reads
            # step 2: count non-spliced reads at acceptors & donors of splice junctions detected in step 1
            for step in 1, 2:
                for sample_id in samples_df.sample_id:
                    metadata_row = samples_df.loc[sample_id]

                    # set job inputs & outputs
                    input_bam, input_bai = metadata_row['bam_path'], metadata_row['bai_path']
                    if "GTEX" in sample_id:
                        output_dir_for_sample_specific_data = "gs://macarthurlab-rnaseq/gtex_v8/fraser_count_rna/"
                    else:
                        output_dir_for_sample_specific_data = f"gs://macarthurlab-rnaseq/{metadata_row['batch']}/fraser_count_rna/"

                    output_dir_for_batch_specific_data = f"gs://macarthurlab-rnaseq/gagneur/fraser/results/{sample_set_label}"

                    output_file_path_splice_junctions_RDS = os.path.join(
                        output_dir_for_batch_specific_data, f"spliceJunctions_{sample_set_label}.RDS")
                    output_file_path_calculated_psi_values_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data, f"calculatedPSIValues_{sample_set_label}.tar.gz")
                    output_file_path_calculated_best_q_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data, f"calculatedBestQ_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data, f"fraserAnalysis_using_PCA_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_results_only_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data, f"fraserAnalysis_using_PCA_{sample_set_label}_results_only.tar.gz")

                    print("Input bam: ", input_bam)
                    if step == 1:
                        output_file_path = os.path.join(output_dir_for_sample_specific_data, f"fraser_count_split_reads_{sample_id}.tar.gz")
                        memory = args.memory_step1
                    elif step == 2:
                        output_file_path = os.path.join(output_dir_for_batch_specific_data, f"fraser_count_non_split_reads_{sample_id}__{sample_set_label}.tar.gz")
                        memory = args.memory_step2

                    if step == 1:
                        split_reads_samples.append(sample_id)
                        split_reads_output_files.append(output_file_path)
                    elif step == 2:
                        non_split_reads_output_files.append(output_file_path)

                    if (step == 1 and args.skip_step1) or (step == 2 and args.skip_step2):
                        continue

                    # check if output file already exists
                    if not args.force and hl.hadoop_is_file(output_file_path):
                        logger.info(f"{sample_id} output file already exists: {output_file_path}. Skipping...")
                        continue

                    if not args.local:
                        file_stats = hl.hadoop_stat(metadata_row['bam_path'])
                        bam_size = int(round(file_stats['size_bytes'] / 10.**9))
                        disk_size = bam_size * 2
                    else:
                        disk_size = None

                    job_label = f"Count {'split' if step == 1 else 'non-split'} reads"
                    j = batch_utils.init_job(batch, f"{job_label}: {sample_id}", cpu=args.cpu, memory=memory, disk_size=disk_size, image=DOCKER_IMAGE)
                    batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

                    j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam")
                    j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai")
                    j.command(f"touch {sample_id}.bam.bai")
                    bam_path = f"{sample_id}.bam"

                    j.command(f"pwd && ls -lh && date")

                    if step == 1:
                        # count split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)
sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)
fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)
getSplitReadCountsForAllSamples(fds)  # saves results to cache/
'""")
                    elif step == 2:
                        if sample_id in split_reads_jobs:
                            j.depends_on(split_reads_jobs[sample_id])
                        if j_extract_splice_junctions:
                            j.depends_on(j_extract_splice_junctions)

                        j.command(f"gsutil -m cp {output_file_path_splice_junctions_RDS} .")

                        # count non-split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)
spliceJunctions = readRDS("{os.path.basename(output_file_path_splice_junctions_RDS)}")
sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)
fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)
getNonSplitReadCountsForAllSamples(fds, spliceJunctions)  # saves results to cache/
'""")

                    j.command(f"ls -lh .")
                    j.command(f"tar czf {os.path.basename(output_file_path)} cache")
                    j.command(f"gsutil -m cp {os.path.basename(output_file_path)} {output_file_path}")
                    j.command(f"echo Done: {output_file_path}")
                    j.command(f"date")

                    print("Output file path: ", output_file_path)

                    if step == 1:
                        split_reads_jobs[sample_id] = j
                    elif step == 2:
                        non_split_reads_jobs[sample_id] = j

                if len(split_reads_output_files) == 0:
                    break

                if step == 1 and not args.skip_step1:
                    if hl.hadoop_is_file(output_file_path_splice_junctions_RDS) and not args.force:
                        logger.info(f"{output_file_path_splice_junctions_RDS} file already exists. Skipping extractSpliceJunctions step...")
                        continue

                    j_extract_splice_junctions = batch_utils.init_job(batch, f"{sample_set_label}: Extract splice-junctions", disk_size=30, memory=60, image=DOCKER_IMAGE)
                    for j in split_reads_jobs.values():
                        j_extract_splice_junctions.depends_on(j)

                    extract_splice_junctions(
                        j_extract_splice_junctions, split_reads_output_files, args.cpu,
                        output_file_path_splice_junctions_RDS)

                elif step == 2 and not args.skip_step2:
                    if hl.hadoop_is_file(output_file_path_calculated_psi_values_tar_gz) and not args.force:
                        logger.info(f"{output_file_path_calculated_psi_values_tar_gz} file already exists. Skipping calculatePSIValues step...")
                        continue

                    num_cpu = 4 if args.local else 16
                    memory = 60
                    j_calculate_psi_values = batch_utils.init_job(batch, f"{sample_set_label}: Calculate PSI values", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE)
                    if j_extract_splice_junctions:
                        j_calculate_psi_values.depends_on(j_extract_splice_junctions)
                    for j in non_split_reads_jobs.values():
                        j_calculate_psi_values.depends_on(j)

                    calculate_psi_values(
                        j_calculate_psi_values, sample_set_label, split_reads_output_files,
                        non_split_reads_output_files, output_file_path_splice_junctions_RDS,
                        args.metadata_tsv_path, num_cpu, output_file_path_calculated_psi_values_tar_gz)

            # compute Best Q
            if args.skip_step3:
                logger.info(f"Skipping calculatedBestQ step...")
            elif hl.hadoop_is_file(output_file_path_calculated_best_q_tar_gz) and not args.force:
                logger.info(f"{output_file_path_calculated_best_q_tar_gz} file already exists. Skipping calculatedBestQ step...")
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu
                j_calculate_best_q = batch_utils.init_job(batch, f"{sample_set_label}: Calculate Best Q", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE)
                if j_calculate_psi_values:
                    j_calculate_best_q.depends_on(j_calculate_psi_values)

                calculate_best_q(
                    j_calculate_best_q, sample_set_label, 4,
                    output_file_path_calculated_psi_values_tar_gz, output_file_path_calculated_best_q_tar_gz)

            # output_file_path_fraser_analysis_tar_gz
            if hl.hadoop_is_file(output_file_path_fraser_analysis_results_only_tar_gz) and not args.force:
                logger.info(f"{output_file_path_fraser_analysis_results_only_tar_gz} file already exists. Skipping run_fraser_analysis step...")
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu
                j_fraser_analysis = batch_utils.init_job(batch, f"{sample_set_label}: Run Fraser Analysis", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE)
                if j_calculate_best_q:
                    j_fraser_analysis.depends_on(j_calculate_best_q)

                run_fraser_analysis(
                    j_fraser_analysis, sample_set_label, 4,
                    output_file_path_calculated_best_q_tar_gz, output_file_path_fraser_analysis_tar_gz,
                    output_file_path_fraser_analysis_results_only_tar_gz)

def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "output_bamout_bam", "output_bamout_bai", "variants_tsv_bgz"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain 'sample_id', 'output_bamout_bam', 'variants_tsv_bgz' columns")

    if args.num_samples_to_process:
        if args.random:
            df = df.sample(n=args.num_samples_to_process)
        else:
            df = df.iloc[:args.num_samples_to_process]

    if args.sample_to_process:
        df = df[df.sample_id.isin(set(args.sample_to_process))]

    logging.info(f"Processing {len(df)} samples")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)

    with open("deidentify_bamout.py", "rt") as f:
        deidentify_bamouts_script = f.read()

    # process sample(s)
    if not args.sample_to_process and not args.num_samples_to_process:
        # if processing entire table, listing all files up front ends up being faster
        existing_deidentify_output_bams = subprocess.check_output(
            f"gsutil -m ls {args.output_dir}/*.deidentify_output.bam", shell=True, encoding="UTF-8").strip().split("\n")
        existing_deidentify_output_sorted_bams = subprocess.check_output(
            f"gsutil -m ls {args.output_dir}/*.deidentify_output.sorted.bam", shell=True, encoding="UTF-8").strip().split("\n")

    hl.init(log="/dev/null")
    with batch_utils.run_batch(args, batch_name=f"deidentify bamouts: {len(df)} samples") as batch:
        for _, row in tqdm.tqdm(df.iterrows(), unit=" samples"):
            output_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.bam")
            output_sorted_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.sorted.bam")

            if args.sample_to_process or args.num_samples_to_process:
                run_deidentify = args.force or not hl.hadoop_is_file(output_bam_path)
                run_sort = run_deidentify or not hl.hadoop_is_file(output_sorted_bam_path)
            else:
                run_deidentify = args.force or output_bam_path not in existing_deidentify_output_bams
                run_sort = run_deidentify or output_sorted_bam_path not in existing_deidentify_output_sorted_bams

            if run_deidentify or run_sort:
                # scale CPU (and therefore memory/disk) with the size of the input bamout
                bamout_stat = hl.hadoop_stat(row.output_bamout_bam)
                cpu = 0.25
                if bamout_stat['size_bytes'] > 0.25 * 20_000_000_000:
                    cpu = 0.5
                if bamout_stat['size_bytes'] > 0.5 * 20_000_000_000:
                    cpu = 1
                if bamout_stat['size_bytes'] > 1 * 20_000_000_000:
                    cpu = 2

            if run_deidentify:
                j = batch_utils.init_job(batch, f"{row.sample_id} - deidentify - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu, disk_size=21*cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                local_tsv_path = batch_utils.localize_file(j, row.variants_tsv_bgz, use_gcsfuse=True)
                local_exclude_tsv_path = batch_utils.localize_file(j, row.exclude_variants_tsv_bgz, use_gcsfuse=True)
                local_bamout_path = batch_utils.localize_file(j, row.output_bamout_bam, use_gcsfuse=True)
                batch_utils.localize_file(j, row.output_bamout_bai, use_gcsfuse=True)

                j.command(f"""echo --------------
echo "Start - time: $(date)"
df -kh

cat <<EOF > deidentify_bamout.py
{deidentify_bamouts_script}
EOF

time python3 deidentify_bamout.py -x "{local_exclude_tsv_path}" "{row.sample_id}" "{local_bamout_path}" "{local_tsv_path}"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.bam" {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.db" {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------
""")
            else:
                logger.info(f"Skipping deidentify {row.sample_id}...")

            if run_sort:
                j2 = batch_utils.init_job(batch, f"{row.sample_id} - sort - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j2, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                if run_deidentify:
                    j2.depends_on(j)

                local_bamout_path = batch_utils.localize_file(j2, output_bam_path, use_gcsfuse=True)

                j2.command(f"""echo --------------
echo "Start - time: $(date)"
df -kh

samtools sort -o "{row.sample_id}.deidentify_output.sorted.bam" "{local_bamout_path}"
samtools index "{row.sample_id}.deidentify_output.sorted.bam"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam" {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam.bai" {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------
""")
            else:
                logger.info(f"Sorted output files exist (eg. {output_sorted_bam_path}). Skipping sort for {row.sample_id}...")

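# The pipeline scripts above share a pattern: skip a sample when its output already exists
# (hl.hadoop_is_file), and reserve Batch disk/CPU in proportion to the input's size_bytes from
# hl.hadoop_stat. A condensed sketch of that pattern (the function name and returned dict are
# illustrative, not part of the original scripts):
import hail as hl


def plan_sample(input_bam_path, output_file_path, force=False):
    """Return None to skip the sample, else a dict of resource hints for the Batch job."""
    if not force and hl.hadoop_is_file(output_file_path):
        return None  # output already exists: skip

    size_gb = int(round(hl.hadoop_stat(input_bam_path)["size_bytes"] / 10.**9))
    return {"disk_size": 2 * size_gb}  # the scripts above reserve roughly 2x the input BAM size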