def main():
    """Run the Broad co-cleaning (indel realignment + BQSR) pipeline.

    Parses command-line arguments, configures file logging and a per-run
    SQLite engine, then drives the GATK-style stages:
    RealignerTargetCreator -> IndelRealigner -> BaseRecalibrator ->
    PrintReads over the supplied harmonized BAM files.
    """
    parser = argparse.ArgumentParser('Broad cocleaning (Inderrealignment and BQSR) pipeline')

    # Logging flags.
    parser.add_argument('-d', '--debug',
                        action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.')
    parser.set_defaults(level=logging.INFO)

    # Required flags.
    parser.add_argument('-r', '--reference_fasta_name', required=True,
                        help='Reference fasta path.')
    parser.add_argument('-indel', '--known_1k_genome_indel_sites', required=True,
                        help='Reference INDEL path.')
    parser.add_argument('-snp', '--dbsnp_known_snp_sites', required=True,
                        help='Reference SNP path.')
    parser.add_argument('-b', '--harmonized_bam_path', required=False, action='append',
                        help='Source bam path.')
    parser.add_argument('-list', '--harmonized_bam_list_path', required=False,
                        help='Source bam list path.')
    parser.add_argument('-s', '--scratch_dir', required=False, type=is_dir,
                        help='Scratch file directory.')
    parser.add_argument('-l', '--log_dir', required=False, type=is_dir,
                        help='Log file directory.')
    parser.add_argument('-j', '--thread_count', required=True, type=is_nat,
                        help='Maximum number of threads for execution.')
    parser.add_argument('-u', '--uuid', required=True,
                        help='analysis_id string')
    parser.add_argument('-m', '--md5', required=False, action='store_true',
                        help='calculate final size/MD5')
    parser.add_argument('-e', '--eliminate_intermediate_files', required=False,
                        action='store_true',
                        help='do not (really) reduce disk usage. set if you want to use more disk space!')
    args = parser.parse_args()

    reference_fasta_name = args.reference_fasta_name
    known_1k_genome_indel_sites = args.known_1k_genome_indel_sites
    dbsnp_known_snp_sites = args.dbsnp_known_snp_sites
    uuid = args.uuid
    harmonized_bam_path = args.harmonized_bam_path

    # BUGFIX: the original subscripted harmonized_bam_path[0] without checking,
    # so invoking the script with neither -b nor -list raised a TypeError.
    # Fail fast with a usable argparse error instead.
    if not args.harmonized_bam_list_path and not harmonized_bam_path:
        parser.error('at least one of -b/--harmonized_bam_path or '
                     '-list/--harmonized_bam_list_path is required')

    if not args.harmonized_bam_list_path:
        # No list supplied: write one next to the first BAM.
        list_dir = os.path.dirname(harmonized_bam_path[0])
        harmonized_bam_list_path = os.path.join(list_dir, uuid + '_harmonized_bam_list.list')
        with open(harmonized_bam_list_path, "w") as handle:
            for bam in harmonized_bam_path:
                handle.write(bam + "\n")
    else:
        harmonized_bam_list_path = args.harmonized_bam_list_path

    # Scratch/log dirs default to the directory holding the BAM list.
    # (scratch_dir is currently unused below but kept for parity with the
    # original behavior.)
    scratch_dir = args.scratch_dir if args.scratch_dir else os.path.dirname(harmonized_bam_list_path)
    log_dir = args.log_dir if args.log_dir else os.path.dirname(harmonized_bam_list_path)
    thread_count = str(args.thread_count)

    # NOTE: intermediates are removed by DEFAULT; passing -e KEEPS them.
    # The flag name is inverted relative to its effect — this matches the
    # original logic and the (deliberately odd) help text, preserved as-is.
    eliminate_intermediate_files = not args.eliminate_intermediate_files
    md5 = args.md5

    # Logging
    logging.basicConfig(
        filename=os.path.join(log_dir, 'Broad_cocleaning_' + uuid + '.log'),  # /host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s', hostname)
    logger.info('harmonized_bam_list_path=%s', harmonized_bam_list_path)

    # If BAM paths were not given on the command line, read them from the
    # list file; either way, log every entry (deduplicates the original's
    # copy-pasted logging loop).
    if not args.harmonized_bam_path:
        with open(harmonized_bam_list_path) as f:
            harmonized_bam_path = f.read().splitlines()
    for path in harmonized_bam_path:
        logger.info('harmonized_bam_path=%s', path)

    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_Broad_cocleaning.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Pipeline
    # Check the .bai files; calls samtools index for any that do not exist.
    RealignerTargetCreator.index(uuid, harmonized_bam_list_path, engine, logger)
    # Call RealignerTargetCreator for the harmonized bam list.
    harmonized_bam_intervals_path = RealignerTargetCreator.RTC(
        uuid, harmonized_bam_list_path, thread_count, reference_fasta_name,
        known_1k_genome_indel_sites, engine, logger)
    # Call IndelRealigner on all BAMs together, but save each read in the
    # output corresponding to the input it came from.
    harmonized_IR_bam_list_path = IndelRealigner.IR(
        uuid, harmonized_bam_list_path, reference_fasta_name,
        known_1k_genome_indel_sites, harmonized_bam_intervals_path, engine, logger)
    # Build a BQSR table per BAM and apply it to produce analysis-ready BAMs.
    Analysis_ready_bam_list_path = []
    for bam in harmonized_IR_bam_list_path:
        harmonized_IR_bam_BQSR_table_path = BaseRecalibrator.BQSR(
            uuid, bam, thread_count, reference_fasta_name,
            dbsnp_known_snp_sites, engine, logger)
        Analysis_ready_bam_path = PrintReads.PR(
            uuid, bam, thread_count, reference_fasta_name,
            harmonized_IR_bam_BQSR_table_path, engine, logger)
        bam_validate.bam_validate(uuid, Analysis_ready_bam_path, engine, logger)
        Analysis_ready_bam_list_path.append(Analysis_ready_bam_path)

    if md5:
        for bam in Analysis_ready_bam_list_path:
            bam_dir = os.path.dirname(bam)
            bam_basename, _bam_ext = os.path.splitext(os.path.basename(bam))
            bai_path = os.path.join(bam_dir, bam_basename + '.bai')
            verify_util.store_md5_size(uuid, bam, engine, logger)
            verify_util.store_md5_size(uuid, bai_path, engine, logger)

    if eliminate_intermediate_files:
        pipe_util.remove_file_list(uuid, harmonized_IR_bam_list_path, engine, logger)
        # Re-validate the final BAMs after removing intermediates.
        # (Dropped the original's unused `validate_file =` binding.)
        for bam in Analysis_ready_bam_list_path:
            bam_validate.bam_validate(uuid, bam, engine, logger)
def main():
    """Run the MuSE somatic variant calling pipeline.

    Parses command-line arguments, configures file logging and a per-run
    SQLite engine, indexes the reference/BAMs/dbSNP inputs as needed, then
    runs MuSE call, MuSE sump (WXS or WGS mode), and Picard SortVcf.
    """
    parser = argparse.ArgumentParser('MuSE variant calling pipeline')

    # Logging flags.
    parser.add_argument('-d', '--debug',
                        action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.')
    parser.set_defaults(level=logging.INFO)

    # Required flags.
    parser.add_argument('-r', '--reference_fasta_name', required=True,
                        help='Reference fasta path.')
    parser.add_argument('-snp', '--dbsnp_known_snp_sites', required=True,
                        help='Reference SNP path, that should be bgzip compressed, tabix indexed')
    # BUGFIX: the original declared these with nargs='?' and
    # default=[sys.stdin]. The default was dead code (required=True means the
    # flag always appears), and nargs='?' allowed the flag with no value,
    # producing None and crashing later in os.path.dirname. A plain required
    # string argument is what every correct invocation already supplied.
    parser.add_argument('-tb', '--analysis_ready_tumor_bam_path', required=True,
                        help='Source patient tumor bam path.')
    parser.add_argument('-nb', '--analysis_ready_normal_bam_path', required=True,
                        help='Source patient normal bam path.')
    parser.add_argument('-g', '--Whole_genome_squencing_data', required=False,
                        action='store_true',
                        help='Whole genome squencing data')
    parser.add_argument('-bs', '--Parallel_Block_Size', type=is_nat, default=50000000,
                        help='Parallel Block Size')
    parser.add_argument('-s', '--scratch_dir', required=False, type=is_dir,
                        help='Scratch file directory.')
    parser.add_argument('-l', '--log_dir', required=False, type=is_dir,
                        help='Log file directory.')
    parser.add_argument('-j', '--thread_count', required=True, type=is_nat,
                        help='Maximum number of threads for execution.')
    parser.add_argument('-u', '--uuid', required=True,
                        help='analysis_id string')
    parser.add_argument('-m', '--md5', required=False, action='store_true',
                        help='calculate final size/MD5')
    parser.add_argument('-e', '--eliminate_intermediate_files', required=False,
                        action='store_true',
                        help='do not (really) reduce disk usage. set if you want to use more disk space!')
    args = parser.parse_args()

    reference_fasta_name = args.reference_fasta_name
    dbsnp_known_snp_sites = args.dbsnp_known_snp_sites
    uuid = args.uuid
    analysis_ready_tumor_bam_path = args.analysis_ready_tumor_bam_path
    analysis_ready_normal_bam_path = args.analysis_ready_normal_bam_path
    blocksize = args.Parallel_Block_Size

    # Scratch/log dirs default to the directory holding the tumor BAM.
    # (scratch_dir is currently unused below but kept for parity with the
    # original behavior.)
    scratch_dir = args.scratch_dir if args.scratch_dir else os.path.dirname(analysis_ready_tumor_bam_path)
    log_dir = args.log_dir if args.log_dir else os.path.dirname(analysis_ready_tumor_bam_path)
    thread_count = str(args.thread_count)

    # NOTE: intermediates are removed by DEFAULT; passing -e KEEPS them.
    # The flag name is inverted relative to its effect — this matches the
    # original logic and the (deliberately odd) help text, preserved as-is.
    eliminate_intermediate_files = not args.eliminate_intermediate_files
    md5 = args.md5

    # Logging
    # BUGFIX: original filename was 'MuSE_variant_calling' + uuid (missing the
    # '_' separator used by the companion co-cleaning pipeline's log name).
    logging.basicConfig(
        filename=os.path.join(log_dir, 'MuSE_variant_calling_' + uuid + '.log'),  # /host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s', hostname)
    logger.info('analysis_ready_tumor_bam_path=%s', analysis_ready_tumor_bam_path)
    logger.info('analysis_ready_normal_bam_path=%s', analysis_ready_normal_bam_path)

    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_MuSE_variant_calling.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Pipeline
    # faidx the reference fasta if needed.
    fai_path = reference_fasta_name + '.fai'
    if not os.path.isfile(fai_path):
        fai_path = index_util.samtools_faidx(uuid, reference_fasta_name, engine, logger)
    logger.info('reference_fai_path=%s', fai_path)

    # Index the input BAM files if needed.
    for path in (analysis_ready_tumor_bam_path, analysis_ready_normal_bam_path):
        bai_path = path + '.bai'
        if not os.path.isfile(bai_path):
            bai_path = index_util.samtools_bam_index(uuid, path, engine, logger)
        logger.info('analysis_ready_bam_bai_path=%s', bai_path)

    # bgzip-compress and tabix-index the dbSNP file if needed.
    # NOTE(review): only a '.bgz' extension is treated as already-compressed;
    # a '.gz' bgzip file would be recompressed — confirm against how
    # index_util.bgzip_compress names its output.
    _dbsnp_name, dbsnp_ext = os.path.splitext(dbsnp_known_snp_sites)
    dbsnp_tabix_path = dbsnp_known_snp_sites + '.tbi'
    if dbsnp_ext == '.bgz':
        logger.info('dbsnp file is already bgzip compressed =%s', dbsnp_known_snp_sites)
        if not os.path.isfile(dbsnp_tabix_path):
            dbsnp_tabix_path = index_util.tabix_index(uuid, dbsnp_known_snp_sites, engine, logger)
        logger.info('tabix index of dbsnp_bgz file =%s', dbsnp_tabix_path)
    else:
        dbsnp_known_snp_sites = index_util.bgzip_compress(uuid, dbsnp_known_snp_sites, engine, logger)
        # BUGFIX: original log message wrongly said "already bgzip compressed"
        # right after performing the compression.
        logger.info('bgzip compressed dbsnp file =%s', dbsnp_known_snp_sites)
        dbsnp_tabix_path = index_util.tabix_index(uuid, dbsnp_known_snp_sites, engine, logger)
        logger.info('tabix index of dbsnp_bgz file =%s', dbsnp_tabix_path)

    # MuSE call
    muse_call_output_path = muse_call.call_region(
        uuid, thread_count, analysis_ready_tumor_bam_path,
        analysis_ready_normal_bam_path, reference_fasta_name, fai_path,
        blocksize, engine, logger)

    # MuSE sump — WXS mode by default, WGS mode when -g is set.
    if not args.Whole_genome_squencing_data:
        muse_vcf = muse_sump.sump_wxs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger)
    else:
        muse_vcf = muse_sump.sump_wgs(uuid, muse_call_output_path, dbsnp_known_snp_sites, engine, logger)

    # Picard SortVcf
    muse_srt_vcf = index_util.picard_sortvcf(uuid, muse_vcf, reference_fasta_name, engine, logger)

    if eliminate_intermediate_files:
        pipe_util.remove_file_list(uuid, [muse_call_output_path], engine, logger)
        pipe_util.remove_file_list(uuid, [muse_vcf], engine, logger)
    if md5:
        verify_util.store_md5_size(uuid, muse_srt_vcf, engine, logger)