def bwa(uuid, bam_path, reference_fasta_path, readgroup_path_dict, engine, logger):
    """Align each FASTQ from ``fastq_util.buildsefastqlist`` with ``bwa_aln_single``.

    The FASTQ directory (``fastq``) and the output directory (``realn``) are
    both derived from the directory that contains ``bam_path``; the output
    directory is created if it does not exist yet.

    Args:
        uuid: Identifier forwarded unchanged to ``bwa_aln_single``.
        bam_path: Path of the source BAM; its parent directory is the
            working directory.
        reference_fasta_path: Reference FASTA forwarded to ``bwa_aln_single``.
        readgroup_path_dict: Mapping handed to ``get_readgroup_str``.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        list: The values returned by ``bwa_aln_single``, in processing order.
    """
    sample_dir = os.path.dirname(bam_path)
    logger.info("uuid_dir=%s" % sample_dir)
    fastq_dir = os.path.join(sample_dir, "fastq")
    logger.info("fastq_dir=%s" % fastq_dir)
    realn_dir = os.path.join(sample_dir, "realn")
    os.makedirs(realn_dir, exist_ok=True)

    all_fastqs = fastq_util.buildfastqlist(fastq_dir, logger)
    logger.info("fastqlist=%s" % all_fastqs)
    se_fastqs = fastq_util.buildsefastqlist(all_fastqs)
    logger.info("sefastqlist=%s" % se_fastqs)

    aligned_bams = []
    # NOTE(review): each call receives the path returned by the previous call
    # rather than the original ``bam_path`` -- confirm this chaining is intended.
    current_bam = bam_path
    for fastq_name in se_fastqs:
        encoding = get_fastq_encoding_from_db(fastq_name, fastq_dir, engine, logger)
        # The looked-up length is not used below; the call itself is kept so
        # that whatever it does (e.g. raising on a missing record) still happens.
        get_fastq_length_from_db(fastq_name, fastq_dir, engine, logger)
        rg_str = get_readgroup_str(fastq_name, readgroup_path_dict, logger)
        current_bam = bwa_aln_single(
            uuid,
            current_bam,
            fastq_dir,
            fastq_name,
            realn_dir,
            "s",
            reference_fasta_path,
            rg_str,
            encoding,
            engine,
            logger,
        )
        aligned_bams.append(current_bam)
    return aligned_bams
def bwa_aln(bam_path, reference_fasta_path, logger):
    """Run ``bwa_aln_paired`` / ``bwa_aln_single`` over the FASTQs next to ``bam_path``.

    FASTQs are read from ``<dir of bam_path>/fastq`` and results are written
    under ``<dir of bam_path>/realn`` (created when missing).  The FASTQ list
    is split by ``fastq_util`` into a paired dict and three single-read lists
    which are processed in the order: paired (sorted by key), then the lists
    tagged ``'s'``, ``'o1'`` and ``'o2'``.

    Args:
        bam_path: Path of the source BAM; only its directory is used to
            locate inputs and outputs before being forwarded to the aligners.
        reference_fasta_path: Reference FASTA forwarded to the aligners.
        logger: Logger used for progress messages.

    Returns:
        list: The values returned by the aligner helpers, in call order.
    """
    base_dir = os.path.dirname(bam_path)
    fastq_dir = os.path.join(base_dir, 'fastq')
    realn_dir = os.path.join(base_dir, 'realn')
    os.makedirs(realn_dir, exist_ok=True)

    all_fastqs = fastq_util.buildfastqlist(fastq_dir)
    paired = fastq_util.buildpefastqdict(all_fastqs)
    logger.info('pefastqdict=%s' % paired)
    se_fastqs = fastq_util.buildsefastqlist(all_fastqs)
    logger.info('sefastqlist=%s' % se_fastqs)
    o1_fastqs = fastq_util.buildo1fastqlist(all_fastqs)
    logger.info('o1fastqlist=%s' % o1_fastqs)
    o2_fastqs = fastq_util.buildo2fastqlist(all_fastqs)
    logger.info('o2fastqlist=%s' % o2_fastqs)

    results = []
    # NOTE(review): every aligner call is given the path returned by the
    # previous call, not the original ``bam_path`` -- confirm this is intended.
    current_bam = bam_path

    for read1 in sorted(paired.keys()):
        current_bam = bwa_aln_paired(
            current_bam,
            fastq_dir,
            read1,
            paired[read1],
            realn_dir,
            reference_fasta_path,
            logger,
        )
        results.append(current_bam)

    # The three single-read groups only differ in the readkey they pass on.
    single_groups = (('s', se_fastqs), ('o1', o1_fastqs), ('o2', o2_fastqs))
    for readkey, fastq_names in single_groups:
        for fastq_name in fastq_names:
            current_bam = bwa_aln_single(
                current_bam,
                fastq_dir,
                fastq_name,
                realn_dir,
                readkey,
                reference_fasta_path,
                logger,
            )
            results.append(current_bam)

    return results
def fastq_validate(uuid, fastq_dir, engine, logger):
    """Call ``do_fastqc`` and ``fastqc_to_db`` for each non-empty FASTQ.

    The candidates are the entries returned by ``fastq_util.buildsefastqlist``
    for the FASTQs found in ``fastq_dir``.  A file is processed only when
    ``get_fastq_size`` reports a size greater than zero.

    Args:
        uuid: Identifier forwarded to the helpers.
        fastq_dir: Directory that holds the FASTQ files.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        None
    """
    all_fastqs = fastq_util.buildfastqlist(fastq_dir, logger)
    logger.info('fastqlist=%s' % all_fastqs)
    for fastq_name in fastq_util.buildsefastqlist(all_fastqs):
        fastq_path = os.path.join(fastq_dir, fastq_name)
        if not get_fastq_size(fastq_path, logger) > 0:
            # Nothing to check in an empty file.
            continue
        do_fastqc(uuid, fastq_path, engine, logger)
        fastqc_to_db(uuid, fastq_path, engine, logger)
def fastq_validate(uuid, fastq_dir, engine, logger):
    """Run the FastQC helpers on every FASTQ in ``fastq_dir`` that has content.

    Only names produced by ``fastq_util.buildsefastqlist`` are considered, and
    files for which ``get_fastq_size`` does not return a value above zero are
    skipped.  For the remaining files ``do_fastqc`` is called, followed by
    ``fastqc_to_db``.

    Args:
        uuid: Identifier forwarded to the helpers.
        fastq_dir: Directory that holds the FASTQ files.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        None
    """
    discovered = fastq_util.buildfastqlist(fastq_dir, logger)
    logger.info('fastqlist=%s' % discovered)
    candidates = fastq_util.buildsefastqlist(discovered)
    for name in candidates:
        path = os.path.join(fastq_dir, name)
        size = get_fastq_size(path, logger)
        if not size > 0:
            continue  # empty FASTQ: skip both the check and the DB write
        do_fastqc(uuid, path, engine, logger)
        fastqc_to_db(uuid, path, engine, logger)
def fastq_guess_encoding(uuid, fastq_dir, engine, logger):
    """Call the encoding-guess helpers for each non-empty FASTQ in ``fastq_dir``.

    The candidates are the entries returned by ``fastq_util.buildsefastqlist``.
    For every file whose ``get_fastq_size`` is greater than zero,
    ``do_guess_encoding`` is called followed by ``guess_enc_db``.

    All messages go through the ``logger`` argument.  The original sent the
    ``fastqlist`` message through the module-level ``logging.info``, i.e. the
    root logger, which bypassed the caller's logger.

    Args:
        uuid: Identifier forwarded to the helpers.
        fastq_dir: Directory that holds the FASTQ files.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        None
    """
    # future thread count
    logger.info('enter fastq_guess_encoding()')
    fastqlist = fastq_util.buildfastqlist(fastq_dir, logger)
    logger.info('fastqlist=%s', fastqlist)
    sefastqlist = fastq_util.buildsefastqlist(fastqlist)
    logger.info('sefastqlist=%s', sefastqlist)
    for seread in sefastqlist:
        fq_path = os.path.join(fastq_dir, seread)
        fq_path_size = get_fastq_size(fq_path, logger)
        if fq_path_size > 0:
            # Empty files are skipped: there is nothing to guess from.
            do_guess_encoding(uuid, fq_path, engine, logger)
            guess_enc_db(uuid, fq_path, engine, logger)
def fastq_guess_encoding(uuid, fastq_dir, engine, logger):
    """Guess and record the encoding of every non-empty FASTQ in ``fastq_dir``.

    FASTQ names come from ``fastq_util.buildsefastqlist``.  Files for which
    ``get_fastq_size`` is not greater than zero are skipped; for the others
    ``do_guess_encoding`` and then ``guess_enc_db`` are called.

    Every message is emitted through the supplied ``logger``.  Previously the
    ``fastqlist`` line used the module-level ``logging.info`` (the root
    logger), which was inconsistent with the other messages.

    Args:
        uuid: Identifier forwarded to the helpers.
        fastq_dir: Directory that holds the FASTQ files.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        None
    """
    # future thread count
    logger.info('enter fastq_guess_encoding()')
    fastqlist = fastq_util.buildfastqlist(fastq_dir, logger)
    logger.info('fastqlist=%s', fastqlist)
    sefastqlist = fastq_util.buildsefastqlist(fastqlist)
    logger.info('sefastqlist=%s', sefastqlist)
    for seread in sefastqlist:
        fq_path = os.path.join(fastq_dir, seread)
        if get_fastq_size(fq_path, logger) > 0:
            do_guess_encoding(uuid, fq_path, engine, logger)
            guess_enc_db(uuid, fq_path, engine, logger)
def fastqc_validate(uuid, bam_path, thread_count, engine, logger):
    """Run the QC and encoding-guess helpers on every FASTQ beside ``bam_path``.

    FASTQs are looked up in ``<dir of bam_path>/fastq`` and split by
    ``fastq_util`` into a paired dict and the ``se``/``o1``/``o2`` lists.  For
    each file ``do_fastqc``, ``fastqc_to_db``, ``do_guess_encoding`` and
    ``guess_enc_db`` are called in that order.  Processing order is: paired
    entries sorted by key (read1, then its mate), then the ``se``, ``o1`` and
    ``o2`` lists.

    Fixes relative to the previous version:
      * the ``o1`` and ``o2`` loops queried the length of ``seread`` (a
        copy-paste slip) instead of the file being processed, which also
        raised ``NameError`` when there were no ``se`` FASTQs;
      * ``fastq_length`` was unbound when no FASTQ was found at all;
      * the ``fastqlist`` message went to the root logger instead of ``logger``.

    Args:
        uuid: Identifier forwarded to the helpers.
        bam_path: Path of the source BAM; only its directory is used.
        thread_count: Forwarded to ``do_fastqc``.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        The value of ``fastq_util.get_fastq_length`` for the last FASTQ it was
        queried for, or ``None`` when no FASTQ was processed.
    """
    uuid_dir = os.path.dirname(bam_path)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    fastqlist = fastq_util.buildfastqlist(fastq_dir)
    logger.info('fastqlist=%s', fastqlist)
    pefastqdict = fastq_util.buildpefastqdict(fastqlist)
    logger.info('pefastqdict=%s', pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastqlist)
    logger.info('sefastqlist=%s', sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastqlist)
    logger.info('o1fastqlist=%s', o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastqlist)
    logger.info('o2fastqlist=%s', o2fastqlist)

    def _check(fastq_name):
        # QC plus encoding guess for one FASTQ, each followed by its DB write.
        fq_path = os.path.join(fastq_dir, fastq_name)
        do_fastqc(uuid, fq_path, thread_count, engine, logger)
        fastqc_to_db(uuid, fq_path, engine, logger)
        do_guess_encoding(uuid, fq_path, engine, logger)
        guess_enc_db(uuid, fq_path, engine, logger)

    # The original author marked the length lookups as "removable"; they are
    # kept because the last one provides the return value.
    fastq_length = None

    for read1 in sorted(pefastqdict):
        _check(read1)
        fastq_length = fastq_util.get_fastq_length(uuid, fastq_dir, read1, engine, logger)
        _check(pefastqdict[read1])  # mate of read1

    for fastq_names in (sefastqlist, o1fastqlist, o2fastqlist):
        for fastq_name in fastq_names:
            _check(fastq_name)
            fastq_length = fastq_util.get_fastq_length(
                uuid, fastq_dir, fastq_name, engine, logger
            )

    return fastq_length
def bwa(uuid, bam_path, reference_fasta_path, readgroup_path_dict, engine, logger):
    """Run ``bwa_aln_single`` on every FASTQ listed by ``fastq_util.buildsefastqlist``.

    Inputs are read from the ``fastq`` directory next to ``bam_path`` and
    outputs go to the sibling ``realn`` directory, which is created on demand.
    For each FASTQ the encoding is fetched with ``get_fastq_encoding_from_db``
    and the read-group string with ``get_readgroup_str`` before aligning.

    Args:
        uuid: Identifier forwarded unchanged to ``bwa_aln_single``.
        bam_path: Path of the source BAM; its parent directory is the
            working directory.
        reference_fasta_path: Reference FASTA forwarded to ``bwa_aln_single``.
        readgroup_path_dict: Mapping handed to ``get_readgroup_str``.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        list: The values returned by ``bwa_aln_single``, in processing order.
    """
    work_dir = os.path.dirname(bam_path)
    logger.info('uuid_dir=%s' % work_dir)
    fastq_dir = os.path.join(work_dir, 'fastq')
    logger.info('fastq_dir=%s' % fastq_dir)
    realn_dir = os.path.join(work_dir, 'realn')
    os.makedirs(realn_dir, exist_ok=True)

    found_fastqs = fastq_util.buildfastqlist(fastq_dir, logger)
    logger.info('fastqlist=%s' % found_fastqs)
    se_fastqs = fastq_util.buildsefastqlist(found_fastqs)
    logger.info('sefastqlist=%s' % se_fastqs)

    outputs = []
    # NOTE(review): the path returned by one alignment is passed as the
    # ``bam_path`` argument of the next -- confirm this chaining is intended.
    previous_bam = bam_path
    for fq_name in se_fastqs:
        fq_encoding = get_fastq_encoding_from_db(fq_name, fastq_dir, engine, logger)
        # Result unused; the lookup is still performed exactly as before.
        get_fastq_length_from_db(fq_name, fastq_dir, engine, logger)
        rg_str = get_readgroup_str(fq_name, readgroup_path_dict, logger)
        previous_bam = bwa_aln_single(
            uuid,
            previous_bam,
            fastq_dir,
            fq_name,
            realn_dir,
            's',
            reference_fasta_path,
            rg_str,
            fq_encoding,
            engine,
            logger,
        )
        outputs.append(previous_bam)
    return outputs
def main():
    """Command-line entry point for the miRNA harmonization pipeline.

    Parses the command line, configures logging to
    ``<log_dir>/aln_<uuid>.log`` and opens a SQLite engine at
    ``<log_dir>/<uuid>_harmonize.db``.  It then calls, in order: BAM
    validation and flagstat on the input, read-group extraction, BAM-to-FASTQ
    conversion, FASTQ validation, ``bam_util.bwa``, Picard sort, Picard merge,
    and finally validation and flagstat on the merged BAM.

    ``--bam_path``, ``--log_dir`` and ``--uuid`` are optional as far as
    argparse is concerned, but the body uses them unconditionally.  They are
    therefore checked right after parsing so that a missing value produces a
    usage error instead of a ``TypeError`` traceback part-way through the run.

    Raises:
        SystemExit: Via ``parser.error`` when a needed argument is missing,
            or from argparse itself on malformed input.
    """
    parser = argparse.ArgumentParser('miRNA harmonization')

    # Logging flag
    parser.add_argument(
        '-d', '--debug',
        action='store_const',
        const=logging.DEBUG,
        dest='level',
        help='Enable debug logging.',
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags
    parser.add_argument(
        '-r', '--reference_fasta_path',
        required=False,
        help='Reference fasta path.',
    )
    parser.add_argument(
        '-b', '--bam_path',
        nargs='?',
        default=[sys.stdin],
        help='Source bam path.',
    )
    parser.add_argument(
        '-l', '--log_dir',
        required=False,
        type=is_dir,
        help='Log file directory.',
    )
    parser.add_argument(
        '-u', '--uuid',
        required=False,
        help='analysis_id string',
    )

    args = parser.parse_args()
    reference_fasta_path = args.reference_fasta_path
    preharmonized_bam_path = args.bam_path
    log_dir = args.log_dir
    uuid = args.uuid

    # Fail early with a usage message.  Without -b the value is the
    # ``[sys.stdin]`` default (or None for a bare ``-b``), which cannot be
    # passed to os.path.dirname() below; log_dir and uuid are joined into
    # file names immediately.
    missing = []
    if not isinstance(preharmonized_bam_path, str):
        missing.append('-b/--bam_path')
    if log_dir is None:
        missing.append('-l/--log_dir')
    if uuid is None:
        missing.append('-u/--uuid')
    if missing:
        parser.error('missing required argument(s): ' + ', '.join(missing))

    # Logging Setup
    logging.basicConfig(
        filename=os.path.join(log_dir, 'aln_' + uuid + '.log'),
        filemode='a',
        level=args.level,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)

    hostname = os.uname()[1]
    logger.info('hostname=%s', hostname)
    logger.info('preharmonized_bam_path=%s', preharmonized_bam_path)

    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_harmonize.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    bam_validate.bam_validate(uuid, preharmonized_bam_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, preharmonized_bam_path, reference_fasta_path, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid, preharmonized_bam_path, engine, logger)
    bam_util.bam_to_fastq(uuid, preharmonized_bam_path, engine, logger)
    top_dir = os.path.dirname(preharmonized_bam_path)
    fastq_dir = os.path.join(top_dir, 'fastq')
    fastq_validate.fastq_validate(uuid, fastq_dir, engine, logger)

    # Harmonization
    be_lenient = False
    harmonized_readgroup_bam_path_list = bam_util.bwa(
        uuid, preharmonized_bam_path, reference_fasta_path, readgroup_path_dict, engine, logger
    )

    # NOTE(review): fastq_list / fastq_path_list are not used below; kept
    # because buildfastqlist's side effects are not visible from here.
    fastq_list = fastq_util.buildfastqlist(fastq_dir, logger)
    fastq_path_list = [os.path.join(fastq_dir, fastq) for fastq in fastq_list]

    # Every path is checked (no short-circuit) so is_aln_bam runs for each one.
    for harmonized_readgroup_bam_path in harmonized_readgroup_bam_path_list:
        if pipe_util.is_aln_bam(harmonized_readgroup_bam_path, logger):
            be_lenient = True

    harmonized_sorted_bam_path_list = picard_bam_sort.bam_sort(
        uuid,
        preharmonized_bam_path,
        harmonized_readgroup_bam_path_list,
        reference_fasta_path,
        engine,
        logger,
        be_lenient,
    )
    harmonized_bam_merge_path = picard_bam_merge.bam_merge(
        uuid,
        preharmonized_bam_path,
        harmonized_sorted_bam_path_list,
        engine,
        logger,
        be_lenient,
    )
    bam_validate.bam_validate(uuid, harmonized_bam_merge_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, harmonized_bam_merge_path, reference_fasta_path, engine, logger)
def bwa_mem(uuid, bam_path, reference_fasta_path, readgroup_path_dict, thread_count, engine, logger):
    """Run ``bwa_mem_paired`` / ``bwa_mem_single`` over the FASTQs next to ``bam_path``.

    FASTQs are read from ``<dir of bam_path>/fastq`` and outputs go to
    ``<dir of bam_path>/realn`` (created when missing).  Processing order is:
    paired entries sorted by key, then the single-read lists tagged ``"s"``,
    ``"o1"`` and ``"o2"``.  A read-group string is obtained from
    ``bam_util.get_readgroup_str`` for every FASTQ before it is aligned.

    All messages are emitted through ``logger``; the ``fastqlist`` message
    previously went to the root logger via ``logging.info``.

    Args:
        uuid: Identifier forwarded to the aligner helpers.
        bam_path: Path of the source BAM; its directory is the working
            directory.
        reference_fasta_path: Reference FASTA forwarded to the aligners.
        readgroup_path_dict: Mapping handed to ``bam_util.get_readgroup_str``.
        thread_count: Forwarded to the aligner helpers.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        list: The values returned by the aligner helpers, in call order.
    """
    uuid_dir = os.path.dirname(bam_path)
    logger.info("uuid_dir=%s", uuid_dir)
    fastq_dir = os.path.join(uuid_dir, "fastq")
    logger.info("fastq_dir=%s", fastq_dir)
    realn_dir = os.path.join(uuid_dir, "realn")
    logger.info("realn_dir=%s", realn_dir)
    os.makedirs(realn_dir, exist_ok=True)

    fastqlist = fastq_util.buildfastqlist(fastq_dir)
    logger.info("fastqlist=%s", fastqlist)
    pefastqdict = fastq_util.buildpefastqdict(fastqlist)
    logger.info("pefastqdict=%s", pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastqlist)
    logger.info("sefastqlist=%s", sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastqlist)
    logger.info("o1fastqlist=%s", o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastqlist)
    logger.info("o2fastqlist=%s", o2fastqlist)

    bam_path_list = []

    # NOTE(review): ``bam_path`` is rebound to each aligner's return value, so
    # later calls receive the previous output path rather than the source BAM.
    # Kept as-is; confirm this chaining is intended.
    for read1 in sorted(pefastqdict):
        rg_str = bam_util.get_readgroup_str(read1, readgroup_path_dict, logger)
        bam_path = bwa_mem_paired(
            uuid,
            bam_path,
            fastq_dir,
            read1,
            pefastqdict[read1],
            realn_dir,
            reference_fasta_path,
            rg_str,
            thread_count,
            engine,
            logger,
        )
        bam_path_list.append(bam_path)

    # The single-read groups differ only in the readkey handed to the aligner.
    single_groups = (("s", sefastqlist), ("o1", o1fastqlist), ("o2", o2fastqlist))
    for readkey, fastq_names in single_groups:
        for fastq_name in fastq_names:
            rg_str = bam_util.get_readgroup_str(fastq_name, readgroup_path_dict, logger)
            bam_path = bwa_mem_single(
                uuid,
                bam_path,
                fastq_dir,
                fastq_name,
                realn_dir,
                readkey,
                reference_fasta_path,
                rg_str,
                thread_count,
                engine,
                logger,
            )
            bam_path_list.append(bam_path)

    return bam_path_list
def main():
    """Parse the command line and run the miRNA harmonization steps in order.

    After parsing, logging is configured to append to
    ``<log_dir>/aln_<uuid>.log`` and a SQLite engine is created at
    ``<log_dir>/<uuid>_harmonize.db``.  The steps called are: input BAM
    validation and flagstat, read-group extraction, BAM-to-FASTQ conversion,
    FASTQ validation, ``bam_util.bwa``, Picard sort and merge, then validation
    and flagstat of the merged BAM.

    argparse treats ``--bam_path``, ``--log_dir`` and ``--uuid`` as optional,
    yet the code below needs all three.  They are validated immediately after
    parsing so that omitting one yields a usage error rather than a
    ``TypeError`` raised from ``os.path`` calls later on.

    Raises:
        SystemExit: Via ``parser.error`` when a needed argument is missing,
            or from argparse itself on malformed input.
    """
    parser = argparse.ArgumentParser('miRNA harmonization')

    # Logging flag
    parser.add_argument(
        '-d',
        '--debug',
        action='store_const',
        const=logging.DEBUG,
        dest='level',
        help='Enable debug logging.',
    )
    parser.set_defaults(level=logging.INFO)

    # Required flags
    parser.add_argument(
        '-r',
        '--reference_fasta_path',
        required=False,
        help='Reference fasta path.',
    )
    parser.add_argument(
        '-b',
        '--bam_path',
        nargs='?',
        default=[sys.stdin],
        help='Source bam path.',
    )
    parser.add_argument(
        '-l',
        '--log_dir',
        required=False,
        type=is_dir,
        help='Log file directory.',
    )
    parser.add_argument(
        '-u',
        '--uuid',
        required=False,
        help='analysis_id string',
    )

    args = parser.parse_args()
    reference_fasta_path = args.reference_fasta_path
    preharmonized_bam_path = args.bam_path
    log_dir = args.log_dir
    uuid = args.uuid

    # Reject unusable values up front: the bam path falls back to
    # ``[sys.stdin]`` (or None for a bare ``-b``), neither of which
    # os.path.dirname() accepts, and log_dir / uuid are concatenated into
    # file names right away.
    absent = [
        flag
        for flag, is_missing in (
            ('-b/--bam_path', not isinstance(preharmonized_bam_path, str)),
            ('-l/--log_dir', log_dir is None),
            ('-u/--uuid', uuid is None),
        )
        if is_missing
    ]
    if absent:
        parser.error('missing required argument(s): ' + ', '.join(absent))

    # Logging Setup
    logging.basicConfig(
        filename=os.path.join(log_dir, 'aln_' + uuid + '.log'),
        filemode='a',
        level=args.level,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)

    hostname = os.uname()[1]
    logger.info('hostname=%s', hostname)
    logger.info('preharmonized_bam_path=%s', preharmonized_bam_path)

    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_harmonize.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    bam_validate.bam_validate(uuid, preharmonized_bam_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, preharmonized_bam_path, reference_fasta_path, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid, preharmonized_bam_path, engine, logger)
    bam_util.bam_to_fastq(uuid, preharmonized_bam_path, engine, logger)
    top_dir = os.path.dirname(preharmonized_bam_path)
    fastq_dir = os.path.join(top_dir, 'fastq')
    fastq_validate.fastq_validate(uuid, fastq_dir, engine, logger)

    # Harmonization
    be_lenient = False
    harmonized_readgroup_bam_path_list = bam_util.bwa(
        uuid, preharmonized_bam_path, reference_fasta_path, readgroup_path_dict, engine, logger
    )

    # NOTE(review): neither fastq_list nor fastq_path_list is read afterwards;
    # left in place because buildfastqlist's side effects are not visible here.
    fastq_list = fastq_util.buildfastqlist(fastq_dir, logger)
    fastq_path_list = [os.path.join(fastq_dir, fastq) for fastq in fastq_list]

    # is_aln_bam is evaluated for every path; one hit is enough to be lenient.
    for harmonized_readgroup_bam_path in harmonized_readgroup_bam_path_list:
        if pipe_util.is_aln_bam(harmonized_readgroup_bam_path, logger):
            be_lenient = True

    harmonized_sorted_bam_path_list = picard_bam_sort.bam_sort(
        uuid,
        preharmonized_bam_path,
        harmonized_readgroup_bam_path_list,
        reference_fasta_path,
        engine,
        logger,
        be_lenient,
    )
    harmonized_bam_merge_path = picard_bam_merge.bam_merge(
        uuid,
        preharmonized_bam_path,
        harmonized_sorted_bam_path_list,
        engine,
        logger,
        be_lenient,
    )
    bam_validate.bam_validate(uuid, harmonized_bam_merge_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, harmonized_bam_merge_path, reference_fasta_path, engine, logger)
def bwa(uuid, bam_path, reference_fasta_path, readgroup_path_dict, thread_count, engine, logger):
    """Align every FASTQ next to ``bam_path``, choosing the aligner by read length.

    FASTQs are read from ``<dir of bam_path>/fastq`` and outputs go to
    ``<dir of bam_path>/realn`` (created when missing).  For each FASTQ the
    length reported by ``fastq_util.get_fastq_length`` is compared with
    ``MEM_ALN_CUTOFF``: below the cutoff the ``bwa_aln_*`` helper is used,
    otherwise the ``bwa_mem_*`` helper.  Processing order is: paired entries
    sorted by key, then the single-read lists tagged ``'s'``, ``'o1'`` and
    ``'o2'``.

    All messages are emitted through ``logger``; the ``fastqlist`` message
    previously went to the root logger via ``logging.info``.

    Args:
        uuid: Identifier forwarded to the helpers.
        bam_path: Path of the source BAM; its directory is the working
            directory.
        reference_fasta_path: Reference FASTA forwarded to the aligners.
        readgroup_path_dict: Mapping handed to ``bam_util.get_readgroup_str``.
        thread_count: Forwarded to the aligner helpers.
        engine: Database engine forwarded to the helpers.
        logger: Logger used for progress messages.

    Returns:
        list: The values returned by the aligner helpers, in call order.
    """
    uuid_dir = os.path.dirname(bam_path)
    logger.info('uuid_dir=%s', uuid_dir)
    fastq_dir = os.path.join(uuid_dir, 'fastq')
    logger.info('fastq_dir=%s', fastq_dir)
    realn_dir = os.path.join(uuid_dir, 'realn')
    logger.info('realn_dir=%s', realn_dir)
    os.makedirs(realn_dir, exist_ok=True)

    fastqlist = fastq_util.buildfastqlist(fastq_dir)
    logger.info('fastqlist=%s', fastqlist)
    pefastqdict = fastq_util.buildpefastqdict(fastqlist)
    logger.info('pefastqdict=%s', pefastqdict)
    sefastqlist = fastq_util.buildsefastqlist(fastqlist)
    logger.info('sefastqlist=%s', sefastqlist)
    o1fastqlist = fastq_util.buildo1fastqlist(fastqlist)
    logger.info('o1fastqlist=%s', o1fastqlist)
    o2fastqlist = fastq_util.buildo2fastqlist(fastqlist)
    logger.info('o2fastqlist=%s', o2fastqlist)

    bam_path_list = []

    # NOTE(review): ``bam_path`` is rebound to each aligner's return value, so
    # later calls receive the previous output path rather than the source BAM.
    # Kept as-is; confirm this chaining is intended.
    for read1 in sorted(pefastqdict):
        rg_str = bam_util.get_readgroup_str(read1, readgroup_path_dict, logger)
        fastq_length = fastq_util.get_fastq_length(uuid, fastq_dir, read1, engine, logger)
        # Both paired aligners take the same arguments; only the choice differs.
        align_paired = bwa_aln_paired if fastq_length < MEM_ALN_CUTOFF else bwa_mem_paired
        bam_path = align_paired(
            uuid,
            bam_path,
            fastq_dir,
            read1,
            pefastqdict[read1],
            realn_dir,
            reference_fasta_path,
            rg_str,
            thread_count,
            engine,
            logger,
        )
        bam_path_list.append(bam_path)

    # The single-read groups differ only in the readkey handed to the aligner.
    single_groups = (('s', sefastqlist), ('o1', o1fastqlist), ('o2', o2fastqlist))
    for readkey, fastq_names in single_groups:
        for fastq_name in fastq_names:
            rg_str = bam_util.get_readgroup_str(fastq_name, readgroup_path_dict, logger)
            fastq_length = fastq_util.get_fastq_length(
                uuid, fastq_dir, fastq_name, engine, logger
            )
            align_single = bwa_aln_single if fastq_length < MEM_ALN_CUTOFF else bwa_mem_single
            bam_path = align_single(
                uuid,
                bam_path,
                fastq_dir,
                fastq_name,
                realn_dir,
                readkey,
                reference_fasta_path,
                rg_str,
                thread_count,
                engine,
                logger,
            )
            bam_path_list.append(bam_path)

    return bam_path_list