def main():
    p = batch_utils.init_arg_parser(
        default_cpu=0.5,
        default_memory=1.75,
        gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("tsv_path", help="Table with header: sample_id, cram_path, crai_path")
    p.add_argument("sample_id", nargs="*", help="(optional) 1 or more sample_ids to process. If not specified, all rows in the .tsv will be processed.")
    args = p.parse_args()

    df = pd.read_table(args.tsv_path)
    if {"sample_id", "cram_path", "crai_path"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain 'sample_id', 'cram_path', 'crai_path' columns")

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args, batch_name=f"extract chrM") as batch:
        for _, row in df.iterrows():
            if args.sample_id and row.sample_id not in set(args.sample_id):
                continue

            input_filename = os.path.basename(row.cram_path)
            prefix = input_filename.replace(".bam", "").replace(".cram", "")
            output_cram_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram")
            output_crai_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram.crai")

            if not args.force and hl.hadoop_is_file(output_cram_path) and hl.hadoop_is_file(output_crai_path):
                logger.info(f"Output files exist (eg. {output_cram_path}). Skipping {input_filename}...")
                continue

            j = batch_utils.init_job(batch, f"chrM: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # copy inputs
            REF_PATHS = batch_utils.HG38_REF_PATHS
            fasta_filename = os.path.basename(parse.urlparse(REF_PATHS.fasta).path)

            j.command(f"""set -ex
env

gsutil -m cp {REF_PATHS.fasta} {REF_PATHS.fai} {REF_PATHS.dict} .

java -Xms2g -jar /gatk.jar PrintReads \
    -R {fasta_filename} \
    -I {row.cram_path} \
    --read-index {row.crai_path} \
    -L chrM \
    --gcs-project-for-requester-pays broad-mpg-gnomad \
    -O {prefix}.chrM.bam

samtools view -C -T {fasta_filename} {prefix}.chrM.bam > {prefix}.chrM.cram
samtools index {prefix}.chrM.cram {prefix}.chrM.cram.crai

gsutil -m cp {prefix}.chrM.cram.crai {output_crai_path}
gsutil -m cp {prefix}.chrM.cram {output_cram_path}
""")

            logger.info(f"Submitted {row.sample_id}: {output_cram_path}")
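# A minimal sketch (not part of the original script) of the input .tsv expected above, based on
# the column check (sample_id, cram_path, crai_path). The sample ID and gs:// paths below are
# hypothetical placeholders.
def write_example_chrM_input_tsv(path="chrM_samples.tsv"):
    import pandas as pd
    example_df = pd.DataFrame([{
        "sample_id": "SAMPLE_1",                               # placeholder sample id
        "cram_path": "gs://example-bucket/SAMPLE_1.cram",      # placeholder path
        "crai_path": "gs://example-bucket/SAMPLE_1.cram.crai", # placeholder path
    }])
    example_df.to_csv(path, sep="\t", index=False)  # tab-separated, as read by pd.read_table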
def main():
    p = batch_utils.init_arg_parser(default_cpu=1, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    args = p.parse_args()

    # process samples
    with batch_utils.run_batch(args, "test") as batch:
        for cpu in (0.25, 0.5, 1, 2):
            args.cpu = cpu
            j = batch.new_job(f"test - {args.cpu} cpu")
            j.image(DOCKER_IMAGE)
            j.cpu(args.cpu)
            j.memory(args.cpu * 3.75)
            #batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)
            #j.command(f"yes > data.txt || true")
            j.command(f"ls -lh")
            j.command(f"df -kh")
            j.command(f"sleep 3600")  # sleep for 1 hour
            j.command(f"free -h")
def main():
    p = batch_utils.init_arg_parser(default_cpu=4, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("--metadata-tsv-path", default=ALL_METADATA_TSV, help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("--counts-tsv-path", default=ALL_COUNTS_TSV_GZ, help="Counts .tsv")
    g = p.add_mutually_exclusive_group()
    g.add_argument("--with-gtex", help="Use GTEX controls.", action="store_true")
    g.add_argument("--only-gtex", help="Run on just the GTEX control samples to test FP rate.", action="store_true")
    p.add_argument("batch_name", nargs="+", choices=ANALYSIS_BATCHES.keys(), help="Name of RNA-seq batch to process")
    args = p.parse_args()

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    batch_label = f"OUTRIDER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:
        for batch_name in args.batch_name:
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            c_vector_of_sample_names = 'c("' + '", "'.join(batch_dict['samples']) + '")'
            if args.with_gtex:
                batch_include_GTEX_samples = "TRUE"
                batch_name += "_with_GTEX"
            elif args.only_gtex:
                c_vector_of_sample_names = "c()"
                batch_include_GTEX_samples = "TRUE"
                batch_name += "_only_GTEX"
            else:
                batch_include_GTEX_samples = "FALSE"
                batch_name += "_without_GTEX"

            j = batch_utils.init_job(batch, batch_name, DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory, disk_size=10)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # copy inputs
            j.command(f"""gsutil -m cp {GENCODE_TXDB} .""")
            j.command(f"""gsutil -m cp {args.metadata_tsv_path} {args.counts_tsv_path} .""")

            output_file = os.path.join(OUTPUT_BASE_DIR, f"{batch_name}.RDS")
            if not args.force and hl.hadoop_is_file(output_file):
                logger.info(f"Output file exists: {output_file} . Skipping {batch_name}...")
                continue

            j.command(f"""time xvfb-run Rscript -e '
# outrider
library(OUTRIDER)
library(annotables)
library(data.table)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(purrr)
library(ggrepel)
library(plotly)
library(stringr)
library(RColorBrewer)
library(ggsci)
library(gtable)
library(grid)
library(gridExtra)

possibleConfounders = c("tissue", "sex", "stranded", "read_length", "batch")  # "RIN"

# input tables generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/export_gagneur_metadata_table.py
# batches generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/metadata_notebook.py
sampleInfo = fread("{os.path.basename(args.metadata_tsv_path)}")
sampleInfo$read_length = as.character(sampleInfo$read_length)

GTEX_sampleIds = c()
if ({batch_include_GTEX_samples}) {{
    if (("{batch_sex}" == "M") || ("{batch_sex}" == "F")) {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$sex == "{batch_sex}") & (sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id
    }} else {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id
    }}
}}

sampleLabel = "{batch_name}_"
sampleSubset = {c_vector_of_sample_names}
sampleSubset = c(sampleSubset, GTEX_sampleIds)
print("sampleSubset: ")
print(sampleSubset)

sampleInfo = sampleInfo[sampleInfo$sample_id %in% sampleSubset]
if (nrow(sampleInfo) != length(sampleSubset)) {{
    print(paste("ERROR: nrow(sampleInfo) != length(sampleSubset):", nrow(sampleInfo), length(sampleSubset)))
    quit("yes")
}}

geneReadCounts = fread("{os.path.basename(args.counts_tsv_path)}", select=c("gene_id", sampleSubset))
geneReadCounts = geneReadCounts[!grep("ERCC", geneReadCounts$gene_id),]

geneIds = geneReadCounts$gene_id
colsMinusGeneId = colnames(geneReadCounts)[!colnames(geneReadCounts) %in% c("gene_id")]
geneReadCounts = geneReadCounts[,..colsMinusGeneId]
rownames(geneReadCounts) = geneIds

cnts = as.matrix(geneReadCounts)
rownames(cnts) = geneIds
ncol(cnts)
nrow(cnts)
if (ncol(cnts) != length(sampleSubset)) {{
    print(paste("ERROR: ncol(cnts) != length(sampleSubset):", ncol(cnts), length(sampleSubset)))
    quit("yes")
}}

sampleInfo[,sampleID:=sample_id]
ods <- OutriderDataSet(countData=cnts, colData=sampleInfo)

txdb <- loadDb("{os.path.basename(GENCODE_TXDB)}")
ods <- filterExpression(ods, gtfFile=txdb, filterGenes=FALSE)  #, fpkmCutoff=100)

g = plotFPKM(ods) + theme_bw() + theme(legend.position="bottom")
ggsave(file=paste(sampleLabel, "_plotFPKM.png", sep=""), g, device="png", type="cairo")
#plotExpressedGenes(ods)

ods <- estimateSizeFactors(ods)
sortedSizeFactors = sort(sizeFactors(ods))
g = ggplot(data=NULL, aes(y=sortedSizeFactors, x=1:ncol(ods))) +
    geom_point(color="blue", size=1) +
    labs(x="Sample rank", y="Size factors", title="Size factor distribution") +
    geom_label_repel(aes(label=ifelse(sortedSizeFactors > 1.5, names(sortedSizeFactors), "")), nudge_x=-35, box.padding=0.35, point.padding=0.5, segment.color="grey50") +
    geom_label_repel(aes(label=ifelse(sortedSizeFactors < 0.5, names(sortedSizeFactors), "")), nudge_x=35, box.padding=0.35, point.padding=0.5, segment.color="grey50") +
    theme_bw()
ggsave(file=paste(sampleLabel, "_sizeFactors.png", sep=""), g, type="cairo")
print(sort(sizeFactors(ods))[1:5])

print(paste(length(ods), "genes before filtering"))
ods <- ods[mcols(ods)$passedFilter,]
print(paste(length(ods), "genes after filtering"))

plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1, filename=paste(sampleLabel, "_plotCountCorHeatmap_before_correction.pdf", sep=""))
plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_before_correction.pdf", sep=""))

if (length(sampleSubset) > 5) {{
    ods = findEncodingDim(ods, BPPARAM=MulticoreParam(4, progressbar=TRUE))
    g = plotEncDimSearch(ods)
    ggsave(file=paste(sampleLabel, "_plotEncDimSearch", ".png", sep=""), g, type="cairo")
    optimal_q = metadata(ods)$opt
}} else {{
    optimal_q = length(sampleSubset)
}}

# increase / decrease by 25%
q = optimal_q
original_ods = ods

ods = OUTRIDER(original_ods, verbose=TRUE, iterations=15, q=q, BPPARAM=MulticoreParam(4, progressbar=TRUE))
saveRDS(ods, paste(sampleLabel, "_ods.RDS", sep=""))

plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1, main=paste("Count correlation heatmap q=", q, sep=""), filename=paste(sampleLabel, "_plotCountCorHeatmap_after_correction.pdf", sep=""))
plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", main=paste("Count Gene vs Sample Heatmap q=", q, sep=""), filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_after_correction.pdf", sep=""))

res = results(ods, padjCutoff=1)
res = res[,c("sampleID", "geneID", "pValue", "padjust", "zScore", "rawcounts")][order(padjust),]
res[, "q"] = q
write.table(res, file=paste(sampleLabel, "_ods__", "q", q, "_results.tsv", sep=""), quote=FALSE, sep="\\t", row.names=FALSE)
'""")

            j.command("gzip *.tsv")
            j.command(f"gsutil -m cp *.tsv.gz *.pdf *.png *.RDS {OUTPUT_BASE_DIR}")
            logger.info(f"Output: {output_file}")
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(default_cpu=4, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b", "--rnaseq-batch-name", nargs="*", help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch']) | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument("-s", "--rnaseq-sample-id", nargs="*", help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set(['GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E', 'GTEX-1KXAM-0005-SM-DIPEC']))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir, batch_name, sex, RIN, ancestry, etc.
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)
    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df.sample_id.isin(set(args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            input_bam, input_bai = metadata_row['bam_path'], metadata_row['bai_path']
            output_dir = metadata_row['output_dir']

            print("Input bam: ", input_bam)
            output_filename = f"{sample_id}.bigWig"
            output_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_file_path) and not args.force:
                logger.info(f"{sample_id} output file already exists: {output_file_path}. Skipping...")
                continue

            file_stats = hl.hadoop_stat(metadata_row['bam_path'])
            bam_size = int(round(file_stats['size_bytes'] / 10.**9))
            disk_size = bam_size * 2

            j = batch_utils.init_job(batch, f"bam=>bigWig: {sample_id}", cpu=args.cpu, memory=args.memory, disk_size=disk_size, image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam")
            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai")
            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes .")
            j.command(f"touch {sample_id}.bam.bai")

            j.command(f"pwd && ls && date")

            j.command(f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}")
            j.command(f"cp {output_filename} {j.output_bigWig}")
            j.command(f"echo Done: {output_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bigWig, output_file_path)

            print("Output file path: ", output_file_path)
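# A hedged sketch (assumption) of what a helper like transfer_metadata_columns_from_df might do,
# inferred from how samples_df is used above: it must end up indexed by sample_id and carry at
# least sample_id, bam_path, bai_path and output_dir columns. The real helper in this repo may
# select more columns or derive output_dir differently.
def example_transfer_metadata_columns_from_df(samples_df, source_df):
    import pandas as pd
    columns = ["sample_id", "bam_path", "bai_path", "output_dir"]  # output_dir assumed present in the metadata table
    df = source_df[columns].copy().set_index("sample_id", drop=False)
    return pd.concat([samples_df, df])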
def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "cram_path", "crai_path", "variants_tsv_bgz"} - set(df.columns):
        p.error(f"{args.cram_and_tsv_paths_table} must contain 'sample_id', 'cram_path', 'crai_path', 'variants_tsv_bgz' columns")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)
    if args.cluster:
        batch_utils.check_storage_bucket_region(df.cram_path)

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args, batch_name=f"HaplotypeCaller -bamout") as batch:
        counter = 0
        for _, row in tqdm.tqdm(df.iterrows(), unit=" rows", total=len(df)):
            if args.sample_to_process and row.sample_id not in set(args.sample_to_process):
                continue

            input_filename = os.path.basename(row.cram_path)
            output_prefix = input_filename.replace(".bam", "").replace(".cram", "")

            output_bam_path = os.path.join(args.output_dir, f"{output_prefix}.bamout.bam")
            output_bai_path = os.path.join(args.output_dir, f"{output_prefix}.bamout.bai")

            if not args.force and hl.hadoop_is_file(output_bam_path) and hl.hadoop_is_file(output_bai_path):
                logger.info(f"Output files exist (eg. {output_bam_path}). Skipping {input_filename}...")
                continue

            counter += 1
            if args.num_samples_to_process and counter > args.num_samples_to_process:
                break

            j = batch_utils.init_job(batch, f"readviz: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

            local_exclude_intervals = batch_utils.localize_file(j, EXCLUDE_INTERVALS)
            local_fasta = batch_utils.localize_file(j, batch_utils.HG38_REF_PATHS.fasta, use_gcsfuse=True)
            local_fasta_fai = batch_utils.localize_file(j, batch_utils.HG38_REF_PATHS.fai, use_gcsfuse=True)
            batch_utils.localize_file(j, batch_utils.HG38_REF_PATHS.dict, use_gcsfuse=True)
            local_tsv_bgz = batch_utils.localize_file(j, row.variants_tsv_bgz)
            local_cram_path = batch_utils.localize_file(j, row.cram_path)
            local_crai_path = batch_utils.localize_file(j, row.crai_path)

            j.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

# 1) Convert variants_tsv_bgz to sorted interval list

gunzip -c "{local_tsv_bgz}" | awk '{{ OFS="\t" }} {{ print( "chr"$1, $2, $2 ) }}' | bedtools slop -b {PADDING_AROUND_VARIANT} -g {local_fasta_fai} > variant_windows.bed

# Sort the .bed file so that chromosomes are in the same order as in the input_cram file.
# Without this, if the input_cram has a different chromosome ordering (eg. chr1, chr10, .. vs. chr1, chr2, ..)
# than the interval list passed to GATK tools' -L arg, then GATK may silently skip some of regions in the -L intervals.
# The sort is done by first retrieving the input_cram header and passing it to GATK BedToIntervalList.

java -Xms2g -jar /gatk/gatk.jar PrintReadsHeader \
    --gcs-project-for-requester-pays {GCLOUD_PROJECT} \
    -R {local_fasta} \
    -I "{local_cram_path}" \
    -O header.bam

java -Xms2g -jar /gatk/gatk.jar BedToIntervalList \
    --SORT true \
    --SEQUENCE_DICTIONARY header.bam \
    --INPUT variant_windows.bed \
    --OUTPUT variant_windows.interval_list

# 2) Get reads from the input_cram for the intervals in variant_windows.interval_list

time java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+DisableAttachMechanism -XX:MaxHeapSize=2000m -Xmx30000m \
    -jar /gatk/GATK35.jar \
    -T HaplotypeCaller \
    -R {local_fasta} \
    -I "{local_cram_path}" \
    -L variant_windows.interval_list \
    -XL {local_exclude_intervals} \
    --disable_auto_index_creation_and_locking_when_reading_rods \
    -ERC GVCF \
    --max_alternate_alleles 3 \
    -variant_index_parameter 128000 \
    -variant_index_type LINEAR \
    --read_filter OverclippedRead \
    -bamout "{output_prefix}.bamout.bam" \
    -o "{output_prefix}.gvcf" |& grep -v "^DEBUG"

bgzip "{output_prefix}.gvcf"
tabix "{output_prefix}.gvcf.gz"

gsutil -m cp "{output_prefix}.bamout.bam" {args.output_dir}
gsutil -m cp "{output_prefix}.bamout.bai" {args.output_dir}

gsutil -m cp "{output_prefix}.gvcf.gz" {args.output_dir}
gsutil -m cp "{output_prefix}.gvcf.gz.tbi" {args.output_dir}

ls -lh
echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------
""")
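# A hedged sketch (assumption) of the parse_args() helper this script relies on, inferred from
# the attributes read above (cram_and_tsv_paths_table, cluster, sample_to_process,
# num_samples_to_process, output_dir). The real implementation, flag spellings, and defaults are
# not shown in this excerpt; "gs://example-output" is a placeholder.
def example_parse_args():
    p = batch_utils.init_arg_parser(default_cpu=1, default_memory=3.75)
    p.add_argument("--cluster", action="store_true", help="Check input bucket regions before running")
    p.add_argument("-s", "--sample-to-process", action="append", help="Only process these sample ids")
    p.add_argument("-n", "--num-samples-to-process", type=int, help="Only process the first n samples")
    p.add_argument("--output-dir", default="gs://example-output", help="Where to write bamout and gvcf files")  # placeholder default
    p.add_argument("cram_and_tsv_paths_table", help="Table with sample_id, cram_path, crai_path, variants_tsv_bgz columns")
    args = p.parse_args()
    return p, args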
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(default_cpu=4, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b", "--rnaseq-batch-name", nargs="*", help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch']) | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument("-s", "--rnaseq-sample-id", nargs="*", help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set(['GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E', 'GTEX-1KXAM-0005-SM-DIPEC']))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, star_SJ_out_tab, output_dir, batch_name
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)
    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df.sample_id.isin(set(args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            output_dir = metadata_row['output_dir']

            print("Input file: ", metadata_row['star_SJ_out_tab'])
            output_filename = f"{sample_id}.junctions.bed.gz"
            output_bed_gz_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_bed_gz_file_path) and not args.force:
                logger.info(f"{sample_id} output file already exists: {output_bed_gz_file_path}. Skipping...")
                continue

            j = batch_utils.init_job(batch, name=f"tab=>bed: {sample_id}", cpu=args.cpu, memory=args.memory, disk_size=5, image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {metadata_row['star_SJ_out_tab']} .")
            j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.gff3.gz .")
            j.command(f"pwd && ls && date")

            j.command(f"python3 /convert_SJ_out_tab_to_junctions_bed.py -g gencode.v26.annotation.gff3.gz {os.path.basename(metadata_row['star_SJ_out_tab'])}")
            j.command(f"cp {output_filename} {j.output_bed_gz}")
            j.command(f"cp {output_filename}.tbi {j.output_bed_gz_tbi}")
            j.command(f"echo Done: {output_bed_gz_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bed_gz, output_bed_gz_file_path)
            batch.write_output(j.output_bed_gz_tbi, f"{output_bed_gz_file_path}.tbi")

            print("Output file path: ", output_bed_gz_file_path)
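# For reference: a STAR SJ.out.tab file (the input converted above) is a tab-separated table with
# 9 columns per junction: chromosome, intron start (1-based), intron end (1-based), strand
# (0=undefined, 1=+, 2=-), intron motif code, annotated flag (0/1), number of uniquely-mapping
# reads crossing the junction, number of multi-mapping reads, and maximum spliced-alignment
# overhang. How convert_SJ_out_tab_to_junctions_bed.py maps these onto .bed columns is
# script-specific and not shown in this excerpt.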
def main():
    p = batch_utils.init_arg_parser(default_cpu=4, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("--with-gtex", help="Use GTEX controls.", action="store_true")
    p.add_argument("--skip-step1", action="store_true", help="Skip count-split-reads step")
    p.add_argument("--skip-step2", action="store_true", help="Skip compute-PSI step")
    p.add_argument("--skip-step3", action="store_true", help="Skip compute-best-Q step")
    p.add_argument("-m1", "--memory-step1", type=float, help="Batch: (optional) memory in gigabytes (eg. 3.75)", default=3.75)
    p.add_argument("-m2", "--memory-step2", type=float, help="Batch: (optional) memory in gigabytes (eg. 3.75)", default=3.75)
    p.add_argument("--metadata-tsv-path", default=ALL_METADATA_TSV, help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("batch_name", nargs="+", choices=ANALYSIS_BATCHES.keys(), help="Name of RNA-seq batch to process")
    args = p.parse_args()

    hl.init(log="/dev/null", quiet=True)
    with hl.hadoop_open(args.metadata_tsv_path) as f:
        samples_df_unmodified = pd.read_table(f).set_index("sample_id", drop=False)

    batch_label = f"FRASER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:
        for batch_name in args.batch_name:
            samples_df = samples_df_unmodified
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            sample_ids = list(batch_dict['samples'])
            if args.with_gtex:
                batch_name += "_with_GTEX"
                samples_df_filter = (samples_df.tissue == batch_tissue)
                samples_df_filter &= samples_df.sample_id.str.startswith("GTEX")
                if batch_sex == "M" or batch_sex == "F":
                    samples_df_filter &= (samples_df.sex == batch_sex)
                sample_ids += list(samples_df[samples_df_filter].sample_id)
            else:
                batch_name += "_without_GTEX"

            samples_df = samples_df.loc[sample_ids]
            byte_string = ", ".join(sorted(samples_df.sample_id)).encode()
            h = hashlib.md5(byte_string).hexdigest().upper()
            sample_set_label = f"{batch_name}__{len(samples_df.sample_id)}_samples_{h[:10]}"

            logger.info(f"Processing {sample_set_label}: {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}")

            split_reads_samples = []

            split_reads_output_files = []
            split_reads_jobs = {}

            non_split_reads_output_files = []
            non_split_reads_jobs = {}

            j_extract_splice_junctions = None
            j_calculate_psi_values = None
            j_calculate_best_q = None

            # based on docs @ https://bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf
            # step 1: count spliced reads
            # step 2: count non-spliced reads at acceptors & donors of splice junctions detected in step 1
            for step in 1, 2:
                for sample_id in samples_df.sample_id:
                    metadata_row = samples_df.loc[sample_id]

                    # set job inputs & outputs
                    input_bam, input_bai = metadata_row['bam_path'], metadata_row['bai_path']
                    if "GTEX" in sample_id:
                        output_dir_for_sample_specific_data = "gs://macarthurlab-rnaseq/gtex_v8/fraser_count_rna/"
                    else:
                        output_dir_for_sample_specific_data = f"gs://macarthurlab-rnaseq/{metadata_row['batch']}/fraser_count_rna/"

                    output_dir_for_batch_specific_data = f"gs://macarthurlab-rnaseq/gagneur/fraser/results/{sample_set_label}"

                    output_file_path_splice_junctions_RDS = os.path.join(output_dir_for_batch_specific_data, f"spliceJunctions_{sample_set_label}.RDS")
                    output_file_path_calculated_psi_values_tar_gz = os.path.join(output_dir_for_batch_specific_data, f"calculatedPSIValues_{sample_set_label}.tar.gz")
                    output_file_path_calculated_best_q_tar_gz = os.path.join(output_dir_for_batch_specific_data, f"calculatedBestQ_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_tar_gz = os.path.join(output_dir_for_batch_specific_data, f"fraserAnalysis_using_PCA_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_results_only_tar_gz = os.path.join(output_dir_for_batch_specific_data, f"fraserAnalysis_using_PCA_{sample_set_label}_results_only.tar.gz")

                    print("Input bam: ", input_bam)
                    if step == 1:
                        output_file_path = os.path.join(output_dir_for_sample_specific_data, f"fraser_count_split_reads_{sample_id}.tar.gz")
                        memory = args.memory_step1
                    elif step == 2:
                        output_file_path = os.path.join(output_dir_for_batch_specific_data, f"fraser_count_non_split_reads_{sample_id}__{sample_set_label}.tar.gz")
                        memory = args.memory_step2

                    if step == 1:
                        split_reads_samples.append(sample_id)
                        split_reads_output_files.append(output_file_path)
                    elif step == 2:
                        non_split_reads_output_files.append(output_file_path)

                    if (step == 1 and args.skip_step1) or (step == 2 and args.skip_step2):
                        continue

                    # check if output file already exists
                    if not args.force and hl.hadoop_is_file(output_file_path):
                        logger.info(f"{sample_id} output file already exists: {output_file_path}. Skipping...")
                        continue

                    if not args.local:
                        file_stats = hl.hadoop_stat(metadata_row['bam_path'])
                        bam_size = int(round(file_stats['size_bytes'] / 10.**9))
                        disk_size = bam_size * 2
                    else:
                        disk_size = None

                    job_label = f"Count {'split' if step == 1 else 'non-split'} reads"
                    j = batch_utils.init_job(batch, f"{job_label}: {sample_id}", cpu=args.cpu, memory=memory, disk_size=disk_size, image=DOCKER_IMAGE)
                    batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

                    j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam")
                    j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai")
                    j.command(f"touch {sample_id}.bam.bai")
                    bam_path = f"{sample_id}.bam"

                    j.command(f"pwd && ls -lh && date")

                    if step == 1:
                        # count split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)

fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)
getSplitReadCountsForAllSamples(fds)  # saves results to cache/
'""")
                    elif step == 2:
                        if sample_id in split_reads_jobs:
                            j.depends_on(split_reads_jobs[sample_id])
                        if j_extract_splice_junctions:
                            j.depends_on(j_extract_splice_junctions)

                        j.command(f"gsutil -m cp {output_file_path_splice_junctions_RDS} .")

                        # count non-split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

spliceJunctions = readRDS("{os.path.basename(output_file_path_splice_junctions_RDS)}")

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)

fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)
getNonSplitReadCountsForAllSamples(fds, spliceJunctions)  # saves results to cache/
'""")

                    j.command(f"ls -lh .")
                    j.command(f"tar czf {os.path.basename(output_file_path)} cache")
                    j.command(f"gsutil -m cp {os.path.basename(output_file_path)} {output_file_path}")

                    j.command(f"echo Done: {output_file_path}")
                    j.command(f"date")

                    print("Output file path: ", output_file_path)

                    if step == 1:
                        split_reads_jobs[sample_id] = j
                    elif step == 2:
                        non_split_reads_jobs[sample_id] = j

                if len(split_reads_output_files) == 0:
                    break

                if step == 1 and not args.skip_step1:
                    if hl.hadoop_is_file(output_file_path_splice_junctions_RDS) and not args.force:
                        logger.info(f"{output_file_path_splice_junctions_RDS} file already exists. Skipping extractSpliceJunctions step...")
                        continue

                    j_extract_splice_junctions = batch_utils.init_job(batch, f"{sample_set_label}: Extract splice-junctions", disk_size=30, memory=60, image=DOCKER_IMAGE)
                    for j in split_reads_jobs.values():
                        j_extract_splice_junctions.depends_on(j)

                    extract_splice_junctions(j_extract_splice_junctions, split_reads_output_files, args.cpu, output_file_path_splice_junctions_RDS)

                elif step == 2 and not args.skip_step2:
                    if hl.hadoop_is_file(output_file_path_calculated_psi_values_tar_gz) and not args.force:
                        logger.info(f"{output_file_path_calculated_psi_values_tar_gz} file already exists. Skipping calculatePSIValues step...")
                        continue

                    num_cpu = 4 if args.local else 16
                    memory = 60
                    j_calculate_psi_values = batch_utils.init_job(batch, f"{sample_set_label}: Calculate PSI values", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE)
                    if j_extract_splice_junctions:
                        j_calculate_psi_values.depends_on(j_extract_splice_junctions)
                    for j in non_split_reads_jobs.values():
                        j_calculate_psi_values.depends_on(j)

                    calculate_psi_values(j_calculate_psi_values, sample_set_label, split_reads_output_files, non_split_reads_output_files,
                        output_file_path_splice_junctions_RDS, args.metadata_tsv_path, num_cpu, output_file_path_calculated_psi_values_tar_gz)

            # compute Best Q
            if args.skip_step3:
                logger.info(f"Skipping calculatedBestQ step...")
            elif hl.hadoop_is_file(output_file_path_calculated_best_q_tar_gz) and not args.force:
                logger.info(f"{output_file_path_calculated_best_q_tar_gz} file already exists. Skipping calculatedBestQ step...")
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu
                j_calculate_best_q = batch_utils.init_job(batch, f"{sample_set_label}: Calculate Best Q", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE)

                if j_calculate_psi_values:
                    j_calculate_best_q.depends_on(j_calculate_psi_values)

                calculate_best_q(j_calculate_best_q, sample_set_label, 4, output_file_path_calculated_psi_values_tar_gz, output_file_path_calculated_best_q_tar_gz)

            # output_file_path_fraser_analysis_tar_gz
            if hl.hadoop_is_file(output_file_path_fraser_analysis_results_only_tar_gz) and not args.force:
                logger.info(f"{output_file_path_fraser_analysis_results_only_tar_gz} file already exists. Skipping run_fraser_analysis step...")
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu
                j_fraser_analysis = batch_utils.init_job(batch, f"{sample_set_label}: Run Fraser Analysis", disk_size=50, cpu=num_cpu, memory=memory, image=DOCKER_IMAGE)

                if j_calculate_best_q:
                    j_fraser_analysis.depends_on(j_calculate_best_q)

                run_fraser_analysis(j_fraser_analysis, sample_set_label, 4, output_file_path_calculated_best_q_tar_gz,
                    output_file_path_fraser_analysis_tar_gz, output_file_path_fraser_analysis_results_only_tar_gz)
def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "output_bamout_bam", "output_bamout_bai", "variants_tsv_bgz"} - set(df.columns):
        p.error(f"{args.cram_and_tsv_paths_table} must contain 'sample_id', 'output_bamout_bam', 'output_bamout_bai', 'variants_tsv_bgz' columns")

    if args.num_samples_to_process:
        if args.random:
            df = df.sample(n=args.num_samples_to_process)
        else:
            df = df.iloc[:args.num_samples_to_process]

    if args.sample_to_process:
        df = df[df.sample_id.isin(set(args.sample_to_process))]

    logging.info(f"Processing {len(df)} samples")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)

    with open("deidentify_bamout.py", "rt") as f:
        deidentify_bamouts_script = f.read()

    # process sample(s)
    if not args.sample_to_process and not args.num_samples_to_process:
        # if processing entire table, listing all files up front ends up being faster
        existing_deidentify_output_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.bam", shell=True, encoding="UTF-8").strip().split("\n")
        existing_deidentify_output_sorted_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.sorted.bam", shell=True, encoding="UTF-8").strip().split("\n")

    hl.init(log="/dev/null")
    with batch_utils.run_batch(args, batch_name=f"deidentify bamouts: {len(df)} samples") as batch:
        for _, row in tqdm.tqdm(df.iterrows(), unit=" samples"):
            output_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.bam")
            output_sorted_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.sorted.bam")

            if args.sample_to_process or args.num_samples_to_process:
                run_deidentify = args.force or not hl.hadoop_is_file(output_bam_path)
                run_sort = run_deidentify or not hl.hadoop_is_file(output_sorted_bam_path)
            else:
                run_deidentify = args.force or output_bam_path not in existing_deidentify_output_bams
                run_sort = run_deidentify or output_sorted_bam_path not in existing_deidentify_output_sorted_bams

            if run_deidentify or run_sort:
                bamout_stat = hl.hadoop_stat(row.output_bamout_bam)
                cpu = 0.25
                if bamout_stat['size_bytes'] > 0.25 * 20_000_000_000:
                    cpu = 0.5
                if bamout_stat['size_bytes'] > 0.5 * 20_000_000_000:
                    cpu = 1
                if bamout_stat['size_bytes'] > 1 * 20_000_000_000:
                    cpu = 2

            if run_deidentify:
                j = batch_utils.init_job(batch, f"{row.sample_id} - deidentify - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu, disk_size=21*cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                local_tsv_path = batch_utils.localize_file(j, row.variants_tsv_bgz, use_gcsfuse=True)
                local_exclude_tsv_path = batch_utils.localize_file(j, row.exclude_variants_tsv_bgz, use_gcsfuse=True)
                local_bamout_path = batch_utils.localize_file(j, row.output_bamout_bam, use_gcsfuse=True)
                batch_utils.localize_file(j, row.output_bamout_bai, use_gcsfuse=True)

                j.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

cat <<EOF > deidentify_bamout.py
{deidentify_bamouts_script}
EOF

time python3 deidentify_bamout.py -x "{local_exclude_tsv_path}" "{row.sample_id}" "{local_bamout_path}" "{local_tsv_path}"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.bam" {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.db" {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------
""")
            else:
                logger.info(f"Skipping deidentify {row.sample_id}...")

            if run_sort:
                j2 = batch_utils.init_job(batch, f"{row.sample_id} - sort - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j2, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)
                if run_deidentify:
                    j2.depends_on(j)

                local_bamout_path = batch_utils.localize_file(j2, output_bam_path, use_gcsfuse=True)

                j2.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

samtools sort -o "{row.sample_id}.deidentify_output.sorted.bam" "{local_bamout_path}"
samtools index "{row.sample_id}.deidentify_output.sorted.bam"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam" {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam.bai" {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------
""")
            else:
                logger.info(f"Sorted output files exist (eg. {output_sorted_bam_path}). Skipping sort for {row.sample_id}...")
def main():
    p = batch_utils.init_arg_parser(default_cpu=NUM_CPU, default_memory=NUM_CPU*3.75, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--all", action="store_true", help="run all samples")
    grp.add_argument("-s", "--sample", help="process specific sample name(s)", action="append")
    grp.add_argument("-n", "--n-samples", type=int, help="run on the 1st n samples only. Useful for debugging")
    p.add_argument("--offset", type=int, default=0, help="apply this offset before applying -n. Useful for debugging")
    p.add_argument("--model", help="Which DeepTrio model to use", choices={"WES", "WGS", "PACBIO"}, required=True)
    p.add_argument("trios_tsv", help="Trios tsv", default="trios.tsv")
    args = p.parse_args()

    if not os.path.isfile(args.trios_tsv):
        p.error(f"File not found: {args.trios_tsv}")

    if args.trios_tsv.endswith(".xls") or args.trios_tsv.endswith(".xlsx"):
        df = pd.read_excel(args.trios_tsv)
    else:
        df = pd.read_table(args.trios_tsv)

    missing_columns = EXPECTED_COLUMNS - set(df.columns)
    if missing_columns:
        p.error(f"{args.trios_tsv} is missing columns: {missing_columns}")

    if args.n_samples:
        df = df[args.offset:args.offset+args.n_samples]

    if args.sample:
        df = df[df.sample_id.isin(set(args.sample))]
        if len(df) < len(set(filter(None, args.sample))):
            p.error(", ".join(set(args.sample) - set(df.sample_id)) + ": sample ids not found or don't have a bam file path")
        logger.info(f"Processing {len(df)} sample(s): " + ", ".join(list(df.sample_id[:10])))
    else:
        logger.info(f"Processing all {len(df)} samples")

    output_subdir = ".".join(os.path.basename(args.trios_tsv).split(".")[:-1])
    existing_output_files = batch_utils.generate_path_to_file_size_dict(os.path.join(OUTPUT_BASE_DIR, f"{output_subdir}/results_*.tar.gz"))

    # process samples
    with batch_utils.run_batch(args, batch_name=f"DeepTrio: " + (", ".join(df.individual_id) if len(df) < 5 else f"{len(df)} trio(s)")) as batch:
        for i, row in df.iterrows():
            name = re.sub(".bam$|.cram$", "", os.path.basename(row.reads))
            name_parent1 = re.sub(".bam$|.cram$", "", os.path.basename(row.parent1_reads))
            name_parent2 = re.sub(".bam$|.cram$", "", os.path.basename(row.parent2_reads))

            output_file = os.path.join(OUTPUT_BASE_DIR, f"{output_subdir}/results_{name}.tar.gz")
            if not args.force and output_file in existing_output_files:
                logger.info(f"Output file exists: {output_file} . Skipping {row.individual_id}...")
                continue

            # init Job
            j = batch_utils.init_job(batch, None, DEEP_TRIO_DOCKER_IMAGE_WITHOUT_GPU if not args.raw else None, cpu=NUM_CPU)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # localize files
            local_ref_fasta_path = batch_utils.localize_file(j, row.ref_fasta, use_gcsfuse=True)
            local_reads_path = batch_utils.localize_via_temp_bucket(j, row.reads)
            local_parent1_reads_path = batch_utils.localize_via_temp_bucket(j, row.parent1_reads)
            local_parent2_reads_path = batch_utils.localize_via_temp_bucket(j, row.parent2_reads)

            batch_utils.localize_file(j, row.ref_fasta_fai, use_gcsfuse=True)
            batch_utils.localize_via_temp_bucket(j, row.reads_index)
            batch_utils.localize_via_temp_bucket(j, row.parent1_reads_index)
            batch_utils.localize_via_temp_bucket(j, row.parent2_reads_index)

            local_ref_cache_tar_gz_path = batch_utils.localize_file(j, "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.ref_cache.tar.gz", use_gcsfuse=True)

            # --regions chr22:38982347-38992804 \
            j.command(f"""mkdir ref_cache
cd ref_cache
tar xzf {local_ref_cache_tar_gz_path} 2>&1 | grep -v '^tar:' || true
export REF_PATH="/ref_cache/ref/cache/%2s/%2s/%s:http://www.ebi.ac.uk/ena/cram/md5/%s"
export REF_CACHE="/ref_cache/ref/cache/%2s/%2s/%s"

mkdir "/results_{name}"
cd "/results_{name}"

/opt/deepvariant/bin/deeptrio/run_deeptrio \
    --model_type {args.model} \
    --ref {local_ref_fasta_path} \
    --reads_child "{local_reads_path}" \
    --reads_parent1 "{local_parent1_reads_path}" \
    --reads_parent2 "{local_parent2_reads_path}" \
    --output_gvcf_child "variants_{name}.gvcf.gz" \
    --output_gvcf_parent1 "variants_{name_parent1}.gvcf.gz" \
    --output_gvcf_parent2 "variants_{name_parent2}.gvcf.gz" \
    --output_vcf_child "variants_{name}.vcf.gz" \
    --output_vcf_parent1 "variants_{name_parent1}.vcf.gz" \
    --output_vcf_parent2 "variants_{name_parent2}.vcf.gz" \
    --sample_name_child "{name}" \
    --sample_name_parent1 "{name_parent1}" \
    --sample_name_parent2 "{name_parent2}" \
    --vcf_stats_report

rm *.gvcf.gz*

cd /
tar czf "results_{name}.tar.gz" "/results_{name}"
gsutil -m cp "results_{name}.tar.gz" {output_file}""")
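# A hedged sketch (assumption) of the EXPECTED_COLUMNS constant this script checks the trios
# table against, inferred from the row fields accessed above. The real definition may include
# additional columns.
EXAMPLE_EXPECTED_COLUMNS = {
    "sample_id", "individual_id",
    "reads", "reads_index",
    "parent1_reads", "parent1_reads_index",
    "parent2_reads", "parent2_reads_index",
    "ref_fasta", "ref_fasta_fai",
}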
"--save-individual-tables", action="store_true", help="Also export individual .bed files with additional columns") p.add_argument("batch_name", nargs="+", choices=analysis_batches | star_pipeline_batches, help="Name of RNA-seq batch to process") args = p.parse_args() if not args.force: hl.init(log="/dev/null", quiet=True) # process batches batch_label = args.batch_name[0] if len( args.batch_name) == 1 else f"{len(args.batch_name)} batches" with batch_utils.run_batch( args, batch_name=f"combine junctions: {batch_label}") as batch: for batch_name in args.batch_name: if batch_name in star_pipeline_batches: output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/combined_SJ_out_tables/" SJ_out_tab_paths = list(rnaseq_sample_metadata_df[ rnaseq_sample_metadata_df["star_pipeline_batch"] == batch_name].star_SJ_out_tab) elif batch_name in analysis_batches: output_dir = f"gs://macarthurlab-rnaseq/combined_SJ_out_tables/{batch_name}/" SJ_out_tab_paths = rnaseq_sample_metadata_df[ rnaseq_sample_metadata_df["sample_id"].isin( ANALYSIS_BATCHES[batch_name] ["samples"])].star_SJ_out_tab else: p.error(f"Unexpected batch name: {batch_name}")
def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "output_bamout_bam", "output_bamout_bai", "variants_tsv_bgz"} - set(df.columns):
        p.error(f"{args.cram_and_tsv_paths_table} must contain 'sample_id', 'output_bamout_bam', 'output_bamout_bai', 'variants_tsv_bgz' columns")

    num_groups = int(math.ceil(len(df) / args.group_size))
    logging.info(f"Creating {num_groups} group(s) with {args.group_size} samples in each")
    groups = []
    for i in range(num_groups):
        if args.num_groups_to_process and i >= args.num_groups_to_process:
            break
        group = df.iloc[i::num_groups]
        groups.append(group)
        logging.info(f"--- group #{i}:")
        logging.info(group)

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)

    if not args.skip_step1:
        existing_combined_bamout_bams = batch_utils.generate_path_to_file_size_dict(f"{OUTPUT_BUCKET}/*.bam")
        input_bam_size_dict = batch_utils.generate_path_to_file_size_dict(f"{INPUT_BAM_BUCKET}/*.deidentify_output.sorted.bam")

    if not args.skip_step2:
        existing_combined_dbs = batch_utils.generate_path_to_file_size_dict(f"{OUTPUT_BUCKET}/*.chr*.db")
        input_db_size_dict = batch_utils.generate_path_to_file_size_dict(f"{INPUT_BAM_BUCKET}/*.deidentify_output.db")

    # process groups
    with batch_utils.run_batch(args, batch_name=f"combine readviz bams: {len(groups)} group(s) (gs{args.group_size}_gn{num_groups}__s{len(df)})") as batch:
        chrom_to_combine_db_jobs = collections.defaultdict(list)
        chrom_to_combined_db_paths = collections.defaultdict(list)
        errors = 0

        temp_dir = "./temp_sql_files__combine_group"
        for i, group in enumerate(tqdm.tqdm(groups, unit=" groups")):
            md5_hash = hashlib.md5(", ".join(sorted(list(group.sample_id))).encode('utf-8')).hexdigest()
            combined_bamout_id = f"s{len(df)}_gs{args.group_size}_gn{num_groups}_gi{i:04d}_h{md5_hash[-9:]}"

            for chrom in ALL_CHROMOSOMES:
                chrom_to_combined_db_paths[chrom].append(f"{args.output_dir}/{combined_bamout_id}.chr{chrom}.db")

            if not args.skip_step1 and not args.db_names_to_process:
                errors += combine_bam_files_in_group(args, batch, combined_bamout_id, group, input_bam_size_dict, existing_combined_bamout_bams)

            if not args.skip_step2:
                errors += combine_db_files_in_group_for_chrom(args, batch, combined_bamout_id, group, chrom_to_combine_db_jobs, input_db_size_dict, existing_combined_dbs, temp_dir=temp_dir)

        if not args.skip_step2:
            os.system(f"gsutil -m cp -r {temp_dir} gs://gnomad-bw2/")

        temp_dir = "./temp_sql_files__combine_all_per_chrom"
        if not args.skip_step3 and not args.num_groups_to_process and not errors:
            # only do this after processing all groups
            combine_all_dbs_for_chrom(args, batch, f"s{len(df)}_gs{args.group_size}_gn{num_groups}", chrom_to_combined_db_paths, chrom_to_combine_db_jobs, temp_dir=temp_dir)

            os.system(f"gsutil -m cp -r {temp_dir} gs://gnomad-bw2/")
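# A small self-contained illustration (not part of the pipeline) of how the grouping above
# distributes samples: df.iloc[i::num_groups] takes every num_groups-th row starting at offset i,
# so rows are dealt out round-robin and group sizes differ by at most one.
def demo_round_robin_grouping():
    import math
    import pandas as pd
    df = pd.DataFrame({"sample_id": [f"S{n}" for n in range(10)]})
    group_size = 3
    num_groups = int(math.ceil(len(df) / group_size))  # 4 groups for 10 samples
    groups = [df.iloc[i::num_groups] for i in range(num_groups)]
    for i, group in enumerate(groups):
        print(i, list(group.sample_id))  # e.g. group 0 -> ['S0', 'S4', 'S8']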