def main() -> None:
    """Run the AF4 benchmark on every sample in ``util.TITRATION_SAMPLES``.

    Command-line flags:
        --cache_dir:  directory that ``util.s3_cache_files`` copies inputs into.
        --result_dir: parent directory; each sample gets ``<result_dir>/<name>``.

    A sample is skipped when ``<result_dir>/<name>/filtered.fa`` already
    exists.  After each run, cached files whose name contains ``rerun`` are
    removed from the cache directory.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument('--cache_dir',
                   default=util.DEFAULT_CACHE_DIR,
                   help='Benchmark cache dir')
    p.add_argument('--result_dir',
                   default=util.DEFAULT_RESULT_DIR,
                   help='Benchmark result dir')
    args = p.parse_args()

    # Reference inputs shared by every sample.
    util.s3_cache_files([
        util.REFERENCE_DIR + '/gencode.v26.whole_genes.fa',
        util.REFERENCE_DIR + '/all_pair_art_lod_gpair_merged.txt'
    ], args.cache_dir)
    cached_ref = args.cache_dir + '/gencode.v26.whole_genes.fa'
    cached_cosmic_fusion = args.cache_dir + '/all_pair_art_lod_gpair_merged.txt'

    for sample in util.TITRATION_SAMPLES:
        logging.info('Start benchmark %s', sample.name)
        result_dir = args.result_dir + '/' + sample.name
        try:
            # exist_ok: a directory left over from an earlier run is not an error.
            os.makedirs(result_dir, 0o755, exist_ok=True)
        except OSError as err:
            # Best effort, as before: log and let the run itself fail if the
            # directory is really unusable.
            logging.error("mkdir %s failed: %s", result_dir, err)
        if os.path.exists(result_dir + "/filtered.fa"):
            logging.info("Skip %s", result_dir)
            continue

        util.s3_cache_files(util.expand_fastq_files(sample.paths),
                            args.cache_dir)
        cached_r1 = ",".join(args.cache_dir + '/' + os.path.basename(fq.r1)
                             for fq in sample.paths)
        cached_r2 = ",".join(args.cache_dir + '/' + os.path.basename(fq.r2)
                             for fq in sample.paths)
        af4_args = [
            str(util.af4_path()),
            f'-log_dir={result_dir}',
            '-pprof=:12345',
            '-mutex-profile-rate=1000',
            '-block-profile-rate=1000',
            f'-r1={cached_r1}',
            f'-r2={cached_r2}',
            f'-fasta-output={result_dir}/all.fa',
            f'-filtered-output={result_dir}/filtered.fa',
            f'-transcript={cached_ref}',
            '-max-genes-per-kmer=2',
            '-max-proximity-distance=1000',
            '-max-proximity-genes=5',
            '-unstranded-prep',
            f'-cosmic-fusion={cached_cosmic_fusion}',
        ]
        util.check_call(af4_args)
        # The old format string had two placeholders ('%d: %s') for a single
        # argument, so the logging module failed to render the record.
        logging.info('Finished benchmark: %s', sample.name)
        logging.info("Runtime stats: %s", util.run_stats(Path(result_dir)))

        # Drop cached files whose name contains "rerun".
        for path in glob.glob(f'{args.cache_dir}/*rerun*'):
            try:
                os.remove(path)
            except OSError as err:
                logging.error("failed to remove %s: %s", path, err)
def main() -> None:
    """Benchmark AF4 and/or STAR-Fusion on every sample in ``util.RNA_SAMPLES``.

    For each sample the FASTQ files are copied into ``--cache_dir`` and the
    systems selected with ``--run`` (both when the flag is absent) are invoked
    on the cached copies.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    parser = argparse.ArgumentParser()
    home_dir = os.environ['HOME']
    parser.add_argument('--cache_dir',
                        default=util.DEFAULT_CACHE_DIR,
                        help='Benchmark cache dir')
    parser.add_argument('--result_dir',
                        default=util.DEFAULT_RESULT_DIR,
                        help='Benchmark result dir')
    parser.add_argument(
        '--starfusion_data_dir',
        default='/scratch-nvme/starfusion',
        help='Directory for expanding starfusion plug-n-play files')
    parser.add_argument(
        '--run',
        action='append',
        choices=['af4', 'starfusion'],
        help='List of systems to run. If unset, run all the configured systems'
    )
    parser.add_argument(
        '--starfusion_plug_n_play_targz',
        default=home_dir + '/GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz',
        help='Tar.gz file of starfusion plug-n-play file. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required'
    )
    parser.add_argument(
        '--starfusion_targz',
        default=home_dir + '/STAR-Fusion-v1.5.0.FULL.tar.gz',
        help='Tar.gz file of starfusion source package. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required'
    )
    args = parser.parse_args()
    # No --run given: fall back to every configured system.
    args.run = args.run or ['af4', 'starfusion']

    for sample in util.RNA_SAMPLES:
        # Each R2 path must be its R1 path with "R1" swapped for "R2".
        for pair in sample.paths:
            assert pair.r1.replace("R1", "R2") == pair.r2, pair.r2

        fastq_files: List[str] = []
        for pair in sample.paths:
            fastq_files.extend((pair.r1, pair.r2))
        cached_file_pairs: List[util.FASTQPair] = [
            util.FASTQPair(
                r1=f"{args.cache_dir}/{os.path.basename(pair.r1)}",
                r2=f"{args.cache_dir}/{os.path.basename(pair.r2)}")
            for pair in sample.paths
        ]

        util.s3_cache_files(fastq_files, args.cache_dir)
        if 'af4' in args.run:
            run_af4(sample.name, cached_file_pairs, args)
        if 'starfusion' in args.run:
            run_starfusion(sample.name, cached_file_pairs, args)
def run_af4(
        sample_name: str,
        cached_file_pairs: List[util.FASTQPair],
        cosmic_fusion_path: str,
        args: Any,
):
    """Run AF4 on one sample in both "denovo" and "targeted" mode.

    Args:
        sample_name: used to name the result directory
            ``<args.result_dir>/<basename(sample_name)>-<mode>``.
        cached_file_pairs: local R1/R2 FASTQ pairs to feed to AF4.  The paths
            are used exactly as given.
        cosmic_fusion_path: fusion-pair list that is copied into
            ``args.cache_dir`` and passed as ``-cosmic-fusion`` in targeted
            mode.
        args: parsed command line; ``cache_dir`` and ``result_dir`` are read.

    A mode is skipped when its ``filtered.fa`` already exists.
    """
    ref_path = "s3://grail-publications/resources/gencode.v26.whole_genes.fa"
    util.s3_cache_files([ref_path, cosmic_fusion_path], args.cache_dir)
    # Use the fusion list the caller asked for.  The flag used to hard-code
    # all_pair_art_lod_gpair_merged.txt, silently ignoring the argument.
    cached_cosmic_fusion = (args.cache_dir + "/" +
                            os.path.basename(cosmic_fusion_path))
    # NOTE(review): the -transcript file below is not the file cached above;
    # presumably it is cached by another step -- confirm.
    transcript = (
        f"{args.cache_dir}/gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa"
    )

    # The pairs already name local files, so do not re-root them under
    # cache_dir: that is a no-op for pairs that live in cache_dir and wrong
    # for pairs that live elsewhere.
    cached_r1 = ",".join(fp.r1 for fp in cached_file_pairs)
    cached_r2 = ",".join(fp.r2 for fp in cached_file_pairs)

    for mode in ["denovo", "targeted"]:
        result_dir = (args.result_dir + "/" +
                      os.path.basename(sample_name + "-" + mode))
        if os.path.exists(result_dir + "/filtered.fa"):
            logging.info("Skipping benchmark: %s", result_dir)
            continue
        logging.info("Start af4 benchmark: %s", result_dir)
        try:
            # exist_ok: a directory left by an interrupted run is fine.
            os.makedirs(result_dir, 0o755, exist_ok=True)
        except OSError as err:
            # Best effort: AF4 itself will fail if the directory is unusable.
            logging.error("mkdir %s failed: %s", result_dir, err)
        af4_args = [
            str(util.af4_path()),
            f"-log_dir={result_dir}",
            "-pprof=:12345",
            "-mutex-profile-rate=1000",
            "-block-profile-rate=1000",
            f"-r1={cached_r1}",
            f"-r2={cached_r2}",
            "-max-genes-per-kmer=2",
            "-max-proximity-distance=1000",
            "-max-proximity-genes=5",
            f"-fasta-output={result_dir}/all.fa",
            f"-filtered-output={result_dir}/filtered.fa",
            f"-transcript={transcript}",
        ]
        if mode == "targeted":
            af4_args.append(f"-cosmic-fusion={cached_cosmic_fusion}")
        util.check_call(af4_args)
        logging.info("Finished benchmark: %s", result_dir)
        logging.info("Runtime stats: %s", util.run_stats(Path(result_dir)))
def main() -> None:
    """Score STAR-Fusion predictions against the ``liu_gpair.txt`` truth set.

    Walks ``--starfusion_dir`` for files whose path contains
    ``star-fusion.fusion_predictions.abridged.tsv`` and prints, for each one,
    the true-positive / false-negative / false-positive counts and the F1
    score.  The truth file is copied into ``--cache_dir`` if it is not already
    there.
    """
    # "%(message)s" was misspelled "%(messge)s", which breaks every log record.
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument(
        "--starfusion_dir",
        default="/scratch-nvme/xyang/result/",
        help=
        "Starfusion result dir, which then contains individual sample result",
    )
    p.add_argument("--cache_dir",
                   default="/scratch-nvme/xyang/cache/",
                   help="cache dir")
    args = p.parse_args()

    local_truth_gpair = f"{args.cache_dir}/liu_gpair.txt"
    if not os.path.exists(local_truth_gpair):
        s3_cache_files([REFERENCE_DIR + "/liu_gpair.txt"], args.cache_dir)

    # Collect every STAR-Fusion prediction file below the result directory.
    marker = "star-fusion.fusion_predictions.abridged.tsv"
    filtered_results = []
    for dirpath, _dirnames, filenames in os.walk(args.starfusion_dir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if marker in full_path:
                filtered_results.append(full_path)

    # Compare each prediction file with the truth set.
    true_gpairs = read_fusion_pair(local_truth_gpair)
    npos = len(true_gpairs)
    for f in filtered_results:
        print(f)
        results = read_fusion_pair(f)
        npredict = len(results)
        tp = len(true_gpairs.intersection(results))
        fn = npos - tp
        fp = npredict - tp
        # Guard the ratios: an empty prediction file, an empty truth set or
        # zero true positives would otherwise raise ZeroDivisionError.
        precision = tp / npredict if npredict else 0.0
        recall = tp / npos if npos else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        print(f"tp={tp}, fn={fn}, fp={fp}, f1={f1:.3f}")
def main() -> None:
    """Run STAR-Fusion on every sample in ``util.SIMULATED_SAMPLES``.

    Each sample's R1/R2 files are copied into ``--cache_dir`` and
    ``run_starfusion`` is called on the cached copies under the name
    ``<sample.n>_<sample.coverage>``.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s:%(levelname)s: %(message)s",
    )
    parser = argparse.ArgumentParser()
    home_dir = os.environ["HOME"]
    parser.add_argument("--cache_dir",
                        default=util.DEFAULT_CACHE_DIR,
                        help="Benchmark cache dir")
    parser.add_argument("--result_dir",
                        default=util.DEFAULT_RESULT_DIR,
                        help="Benchmark result dir")
    parser.add_argument(
        "--starfusion_data_dir",
        default="/scratch-nvme/starfusion",
        help="Directory for expanding starfusion plug-n-play files",
    )
    parser.add_argument(
        "--starfusion_plug_n_play_targz",
        default=home_dir + "/GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
        help="Tar.gz file of starfusion plug-n-play file. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    parser.add_argument(
        "--starfusion_targz",
        default=home_dir + "/STAR-Fusion-v1.5.0.FULL.tar.gz",
        help="Tar.gz file of starfusion source package. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    args = parser.parse_args()

    for sample in util.SIMULATED_SAMPLES:
        remote_r1, remote_r2 = sample.path.r1, sample.path.r2
        util.s3_cache_files([remote_r1, remote_r2], args.cache_dir)
        # A simulated sample has exactly one FASTQ pair.
        cached_file_pairs: List[util.FASTQPair] = [
            util.FASTQPair(
                r1=f"{args.cache_dir}/{os.path.basename(remote_r1)}",
                r2=f"{args.cache_dir}/{os.path.basename(remote_r2)}",
            )
        ]
        print(cached_file_pairs)
        sample_name = f"{sample.n}_{sample.coverage}"
        run_starfusion(sample_name, cached_file_pairs, args)
def run_af4(sample_name: str, cached_file_pairs: List[util.FASTQPair],
            args: Any):
    """Run AF4 on one sample in both "denovo" and "targeted" mode.

    Args:
        sample_name: used to name the result directory
            ``<args.result_dir>/<basename(sample_name)>-<mode>``.
        cached_file_pairs: R1/R2 FASTQ pairs; only the file names are used,
            the files are looked up in ``args.cache_dir``.
        args: parsed command line; ``cache_dir`` and ``result_dir`` are read.

    A mode is skipped when its ``filtered.fa`` already exists.
    """
    ref_path = "s3://grail-publications/resources/gencode.v26.whole_genes.fa"
    cosmic_fusion_path = "s3://grail-publications/resources/all_pair_art_lod_gpair_merged.txt"
    util.s3_cache_files([ref_path, cosmic_fusion_path], args.cache_dir)
    # NOTE(review): the -transcript file below is not the file cached above;
    # presumably it is cached by another step -- confirm.
    transcript = (
        f'{args.cache_dir}/gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa'
    )

    cached_r1 = ",".join(args.cache_dir + '/' + os.path.basename(fp.r1)
                         for fp in cached_file_pairs)
    cached_r2 = ",".join(args.cache_dir + '/' + os.path.basename(fp.r2)
                         for fp in cached_file_pairs)

    for mode in ['denovo', 'targeted']:
        result_dir = (args.result_dir + '/' +
                      os.path.basename(sample_name + '-' + mode))
        if os.path.exists(result_dir + "/filtered.fa"):
            logging.info('Skipping benchmark: %s', result_dir)
            continue
        logging.info('Start af4 benchmark: %s', result_dir)
        try:
            # exist_ok: a directory left by an interrupted run (no filtered.fa
            # yet) used to be reported as a mkdir failure.
            os.makedirs(result_dir, 0o755, exist_ok=True)
        except OSError as err:
            # Best effort: AF4 itself will fail if the directory is unusable.
            # Catching OSError only keeps Ctrl-C working.
            logging.error("mkdir %s failed: %s", result_dir, err)
        af4_args = [
            str(util.af4_path()),
            f'-log_dir={result_dir}',
            '-pprof=:12345',
            '-mutex-profile-rate=1000',
            '-block-profile-rate=1000',
            '-umi-in-read',
            f'-r1={cached_r1}',
            f'-r2={cached_r2}',
            f'-fasta-output={result_dir}/all.fa',
            f'-filtered-output={result_dir}/filtered.fa',
            f'-transcript={transcript}',
        ]
        if mode == 'targeted':
            af4_args.append(
                f'-cosmic-fusion={args.cache_dir}/all_pair_art_lod_gpair_merged.txt'
            )
        util.check_call(af4_args)
        logging.info('Finished benchmark: %s', result_dir)
        logging.info("Runtime stats: %s", util.run_stats(Path(result_dir)))
def main() -> None:
    """Benchmark AF4 on the BRCA cell-line data and AF4/STAR-Fusion on cfRNA.

    Two passes are made:

    1. For each of the BT474, KPL4, MCF7 and SKBR3 directories under
       ``--brca_data_dir``, files containing ``_1`` are paired with their
       ``_2`` counterpart and handed to ``run_af4``.  Missing directories
       trigger one call to ``download_brca_data.py``.  This pass only runs
       when ``af4`` is among the selected systems.
    2. For each sample in ``util.RNA_SAMPLES`` the FASTQ files are cached and
       the systems selected with ``--run`` are invoked.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument("--cache_dir",
                   default=util.DEFAULT_CACHE_DIR,
                   help="Benchmark cache dir")
    p.add_argument("--result_dir",
                   default=util.DEFAULT_RESULT_DIR,
                   help="Benchmark result dir")
    p.add_argument(
        "--starfusion_data_dir",
        default="/scratch-nvme/starfusion",
        help="Directory for expanding starfusion plug-n-play files",
    )
    p.add_argument(
        "--run",
        action="append",
        choices=["af4", "starfusion"],
        help="List of systems to run. If unset, run all the configured systems",
    )
    p.add_argument(
        "--starfusion_plug_n_play_targz",
        default=os.environ["HOME"] +
        "/GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
        help=
        "Tar.gz file of starfusion plug-n-play file. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    p.add_argument(
        "--starfusion_targz",
        default=os.environ["HOME"] + "/STAR-Fusion-v1.5.0.FULL.tar.gz",
        help=
        "Tar.gz file of starfusion source package. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    p.add_argument(
        "--brca_data_dir",
        default="/scratch-nvme/xyang/brca_rnaseq_data",
        help="BT474, KPL4, MCF7, SKBR3 Breast cancer data directory",
    )
    args = p.parse_args()
    if not args.run:
        args.run = ["af4", "starfusion"]

    ## brca rna-seq for af4
    # Honour --run: this pass only exercises AF4.
    if "af4" in args.run:
        brca_samples = [
            os.path.join(args.brca_data_dir, s)
            for s in ["BT474", "KPL4", "MCF7", "SKBR3"]
        ]
        # The sample paths already include brca_data_dir; joining it a second
        # time produced a wrong path for a relative --brca_data_dir.  One
        # download call is enough, and it must target the directory that is
        # read below rather than a hard-coded one.
        if not all(os.path.exists(s) for s in brca_samples):
            util.check_call([
                "download_brca_data.py",
                "--odir",
                args.brca_data_dir,
            ])
        cosmic_fusion_path = (
            "s3://grail-publications/2019-ISMB/references/all_art_lod_brca.txt")
        for sample in brca_samples:
            cached_file_pairs: List[util.FASTQPair] = []
            # sorted() makes the FASTQ order independent of listdir order.
            for fq in sorted(os.listdir(sample)):
                if "_1" not in fq:
                    continue
                # Substitute in the file name only, so a "_1" in a directory
                # name cannot corrupt the mate path.
                r2 = os.path.join(sample, fq.replace("_1", "_2"))
                assert os.path.exists(r2)
                cached_file_pairs.append(
                    util.FASTQPair(r1=os.path.join(sample, fq), r2=r2))
            print(os.path.basename(sample))
            print(cached_file_pairs)
            run_af4(os.path.basename(sample), cached_file_pairs,
                    cosmic_fusion_path, args)

    ## cfrna for af4 and starfusion
    cosmic_fusion_path = (
        "s3://grail-publications/2019-ISMB/references/all_pair_art_lod_gpair_merged.txt"
    )
    for sample in util.RNA_SAMPLES:
        fastq_files: List[str] = []
        cached_file_pairs = []
        for fp in sample.paths:
            assert fp.r1.replace("R1", "R2") == fp.r2, fp.r2
            fastq_files += [fp.r1, fp.r2]
            cached_file_pairs.append(
                util.FASTQPair(
                    r1=args.cache_dir + "/" + os.path.basename(fp.r1),
                    r2=args.cache_dir + "/" + os.path.basename(fp.r2),
                ))
        util.s3_cache_files(fastq_files, args.cache_dir)
        if "af4" in args.run:
            run_af4(sample.name, cached_file_pairs, cosmic_fusion_path, args)
        if "starfusion" in args.run:
            run_starfusion(sample.name, cached_file_pairs, args)
def main() -> None:
    """Run AF4 on the simulated samples and print one LaTeX table row each.

    For every mode ("denovo", "targeted") and every sample in
    ``util.SIMULATED_SAMPLES`` the FASTQ pair is cached, AF4 is run into
    ``<result_dir>/synthetic-<mode>-<n>-<coverage>`` unless ``filtered.fa``
    already exists (``--rerun_af4`` forces a run), and the tp/fp/fn counts
    computed by ``TargetedFusionStats`` against ``liu_gpair.txt`` are printed.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument('--cache_dir',
                   default=util.DEFAULT_CACHE_DIR,
                   help='Benchmark cache dir')
    p.add_argument('--result_dir',
                   default=util.DEFAULT_RESULT_DIR,
                   help='Benchmark result dir')
    p.add_argument(
        '--rerun_af4',
        action='store_true',
        help='Always run AF4 even if the result file already exists')
    # NOTE(review): --recache_files is parsed but never read in this function.
    p.add_argument(
        '--recache_files',
        action='store_true',
        help=
        'Always copy benchmark data files, even if they already exist locally.'
    )
    args = p.parse_args()

    transcript_name = 'gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa'
    util.s3_cache_files([
        util.REFERENCE_DIR + '/' + transcript_name,
        util.REFERENCE_DIR + '/all_pair_art_lod_gpair_merged.txt',
        util.REFERENCE_DIR + '/liu_gpair.txt'
    ], args.cache_dir)

    for mode in ['denovo', 'targeted']:
        for sample in util.SIMULATED_SAMPLES:
            util.s3_cache_files([sample.path.r1, sample.path.r2],
                                args.cache_dir)
            result_dir = f'{args.result_dir}/synthetic-{mode}-{sample.n}-{sample.coverage}'
            try:
                # This runs before the "already done" check, so the directory
                # normally exists on a rerun; exist_ok avoids a bogus error.
                os.makedirs(result_dir, 0o755, exist_ok=True)
            except OSError as err:
                logging.error("mkdir %s failed: %s", result_dir, err)
            if not os.path.exists(
                    f'{result_dir}/filtered.fa') or args.rerun_af4:
                logging.info('running benchmark in %s', result_dir)
                # Cached copies are addressed by file name; interpolating the
                # full source path after cache_dir does not name the cached
                # file when the source path has a directory or URL prefix.
                cached_r1 = f'{args.cache_dir}/{os.path.basename(sample.path.r1)}'
                cached_r2 = f'{args.cache_dir}/{os.path.basename(sample.path.r2)}'
                af4_args = [
                    str(util.af4_path()),
                    f'-log_dir={result_dir}',
                    f'-r1={cached_r1}',
                    f'-r2={cached_r2}',
                    f'-fasta-output={result_dir}/all.fa',
                    f'-filtered-output={result_dir}/filtered.fa',
                    '-transcript=' + args.cache_dir + '/' + transcript_name,
                ]
                if mode == 'targeted':
                    af4_args.append('-cosmic-fusion=' + args.cache_dir +
                                    '/all_pair_art_lod_gpair_merged.txt')
                util.check_call(af4_args)
                logging.info("Runtime stats: %s",
                             util.run_stats(Path(result_dir)))

            stats = TargetedFusionStats(
                Path(f'{args.cache_dir}/liu_gpair.txt'),
                Path(f'{result_dir}/filtered.fa'))
            s = stats.stats()
            tp = "%d" % (s.tp, )
            fp = "%d" % (s.fp, )
            fn = "%d" % (s.fn, )
            # Emits "a & b & ... \\", i.e. one row of a LaTeX tabular.
            print(
                f'{mode} & {sample.n} & {sample.coverage} & {tp} & {fp} & {fn}\\\\'
            )
def main() -> None:
    """Run AF4 on the simulated samples and print one LaTeX table row each.

    For every mode ("denovo", "targeted") and every sample in
    ``util.SIMULATED_SAMPLES`` the FASTQ pair is cached, AF4 is run into
    ``<result_dir>/synthetic-<mode>-<n>-<coverage>`` unless ``filtered.fa``
    already exists (``--rerun_af4`` forces a run), and the tp/fp/fn counts
    computed by ``TargetedFusionStats`` against ``liu_gpair.txt`` are printed.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument("--cache_dir",
                   default=util.DEFAULT_CACHE_DIR,
                   help="Benchmark cache dir")
    p.add_argument("--result_dir",
                   default=util.DEFAULT_RESULT_DIR,
                   help="Benchmark result dir")
    p.add_argument(
        "--rerun_af4",
        action="store_true",
        help="Always run AF4 even if the result file already exists",
    )
    # NOTE(review): --recache_files is parsed but never read in this function.
    p.add_argument(
        "--recache_files",
        action="store_true",
        help=
        "Always copy benchmark data files, even if they already exist locally.",
    )
    args = p.parse_args()

    transcript_name = "gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa"
    util.s3_cache_files(
        [
            util.REFERENCE_DIR + "/" + transcript_name,
            util.REFERENCE_DIR + "/all_pair_art_lod_gpair_merged.txt",
            util.REFERENCE_DIR + "/liu_gpair.txt",
        ],
        args.cache_dir,
    )

    for mode in ["denovo", "targeted"]:
        for sample in util.SIMULATED_SAMPLES:
            util.s3_cache_files([sample.path.r1, sample.path.r2],
                                args.cache_dir)
            result_dir = (
                f"{args.result_dir}/synthetic-{mode}-{sample.n}-{sample.coverage}"
            )
            try:
                # This runs before the "already done" check, so the directory
                # normally exists on a rerun; exist_ok avoids a bogus error.
                os.makedirs(result_dir, 0o755, exist_ok=True)
            except OSError as err:
                logging.error("mkdir %s failed: %s", result_dir, err)
            if not os.path.exists(
                    f"{result_dir}/filtered.fa") or args.rerun_af4:
                logging.info("running benchmark in %s", result_dir)
                # Cached copies are addressed by file name; interpolating the
                # full source path after cache_dir does not name the cached
                # file when the source path has a directory or URL prefix.
                cached_r1 = f"{args.cache_dir}/{os.path.basename(sample.path.r1)}"
                cached_r2 = f"{args.cache_dir}/{os.path.basename(sample.path.r2)}"
                af4_args = [
                    str(util.af4_path()),
                    f"-log_dir={result_dir}",
                    f"-r1={cached_r1}",
                    f"-r2={cached_r2}",
                    f"-fasta-output={result_dir}/all.fa",
                    f"-filtered-output={result_dir}/filtered.fa",
                    "-max-genes-per-kmer=2",
                    "-max-proximity-distance=1000",
                    "-max-proximity-genes=5",
                    "-transcript=" + args.cache_dir + "/" + transcript_name,
                ]
                if mode == "targeted":
                    af4_args.append("-cosmic-fusion=" + args.cache_dir +
                                    "/all_pair_art_lod_gpair_merged.txt")
                util.check_call(af4_args)
                logging.info("Runtime stats: %s",
                             util.run_stats(Path(result_dir)))

            stats = TargetedFusionStats(
                Path(f"{args.cache_dir}/liu_gpair.txt"),
                Path(f"{result_dir}/filtered.fa"),
            )
            s = stats.stats()
            tp = "%d" % (s.tp, )
            fp = "%d" % (s.fp, )
            fn = "%d" % (s.fn, )
            # Emits "a & b & ... \\", i.e. one row of a LaTeX tabular.
            print(
                f"{mode} & {sample.n} & {sample.coverage} & {tp} & {fp} & {fn}\\\\"
            )