def fastq(ifile, ofile1, ofile2):
    """
    Write the reads of a BAM file to a pair of FASTQ files using ``pysam.fastq``.

    :param ifile: input bam file
    :param ofile1: first file of fastq pair
    :param ofile2: second file of fastq pair
    """
    message = 'generating fastq file for ' + str(ifile)
    logging.info(message)

    # Arguments are handed to samtools fastq exactly as on the command line.
    samtools_args = ['-1', ofile1, '-2', ofile2, '-n', ifile]
    pysam.fastq(*samtools_args)
def simple_fastq(tmp_path, simple_bam):
    """
    Dump ``simple_bam`` to a FASTQ file named ``simple.fq`` under ``tmp_path``.

    The text returned by ``pysam.fastq('-t', '-N', '-O', ...)`` has every tab
    replaced by an underscore before it is written.

    :param tmp_path: directory-like object supporting ``/`` (e.g. a ``pathlib.Path``)
    :param simple_bam: path of the BAM file to convert
    :return: path of the written FASTQ file, as a string
    """
    import pysam

    fastq_text = pysam.fastq('-t', '-N', '-O', simple_bam)
    target = tmp_path / 'simple.fq'
    target.write_text(fastq_text.replace('\t', '_'))
    return str(target)
elif (len(sys.argv) == 4):
    # Script name plus three arguments: input BAM, BED file and output prefix.
    bam_in = sys.argv[1]
    bed_path = sys.argv[2]
    out_prefix = sys.argv[3]
else:
    # Argument count not handled by the branches of this chain:
    # print the usage line to stderr and exit with status 1.
    sys.stderr.write(
        "Usage: bam2fq_for_realign.py miniBAM U1_BED [OUT_PREFIX]")
    sys.exit(1)

mq_cut = 100  # reads with MAPQ < mq_cut will be realigned
# Temporary BAM is written next to the input, with its extension replaced by '.tmp.bam'.
bam_tmp = os.path.splitext(bam_in)[0] + '.tmp.bam'

# Remove duplicates (1024)
# -F 1024 drops reads flagged as duplicates, -f 1 keeps only paired reads,
# -u writes uncompressed BAM; the result goes to bam_tmp rather than stdout.
pysam.view('-o', bam_tmp, '-uF', '1024', '-f', '1', bam_in, catch_stdout=False)

# Convert miniBAM to FASTQ
# The whole FASTQ text is captured and split into a list of lines.
fq = pysam.fastq('-O', bam_tmp).split('\n')

# Convert FASTQ to dict
# Records are read as groups of four lines: index 0 is the header line,
# 1 the bases, 3 the qualities; the '+' separator (index 2) is ignored.
fq_dict = {}
for i in range(len(fq)):
    if i % 4 == 0:
        read_name = fq[i]
    elif i % 4 == 1:
        dna = fq[i]
    elif i % 4 == 3:
        qual = fq[i]
        # Record one read into dict
        fq_dict[read_name] = (dna, qual)
read_names = list(fq_dict.keys())

# Convert miniBAM to BED
# Both the BED file and the temporary BAM are wrapped as BedTool objects.
U1U11 = BedTool(bed_path)
bam = BedTool(bam_tmp)
def reconst_a(args, params, filenames, refseqid):
    """
    Build a consensus sequence for one viral reference and, optionally,
    assemble the reads mapped to it de novo.

    Steps performed, in order:

    1. extract the alignments on ``refseqid`` from
       ``filenames.mapped_to_virus_bam`` into ``filenames.tmp_bam``;
    2. write the reference sequence to ``filenames.tmp_fa`` and index it;
    3. call ``mask_low_depth`` (presumably the producer of
       ``filenames.tmp_masked_fa`` used below -- verify against that function);
    4. picard CreateSequenceDictionary / AddOrReplaceReadGroups,
       gatk HaplotypeCaller, bcftools norm / index / consensus;
    5. delete intermediates;
    6. when ``args.denovo is True``, export paired reads and run metaspades.

    Any failure is logged and terminates the process with exit status 1.

    :param args: parsed command-line options (reads ``p``, ``alignmentin``,
        ``b``, ``c``, ``fq1``, ``vref``, ``picard``, ``keep``, ``denovo``)
    :param params: parameter object (reads ``metaspades_kmer``,
        ``metaspades_memory``; also forwarded to ``mask_low_depth``)
    :param filenames: object holding the input/output/temporary paths
    :param refseqid: id of the reference sequence to reconstruct
    """

    def _run_step(cmd, failure_message):
        """Run one shell command, log its stderr, exit with status 1 on failure."""
        # NOTE(review): the command is a shell string built from file paths;
        # paths containing spaces or shell metacharacters will break it.
        out = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE)
        log.logger.debug(
            '\n' + '\n'.join([l.decode() for l in out.stderr.splitlines()]))
        if out.returncode != 0:
            log.logger.error(failure_message)
            raise SystemExit(1)

    log.logger.debug('started.')
    try:
        # Leave one thread free when three or more are available.
        thread_n = args.p if args.p <= 2 else args.p - 1

        if args.alignmentin is True:
            alignment_path = args.b if args.b is not None else args.c
            sample_name = os.path.basename(alignment_path)
        else:
            sample_name = os.path.basename(args.fq1)

        pysam.view(filenames.mapped_to_virus_bam, '-h', '-o',
                   filenames.tmp_bam, refseqid, catch_stdout=False)
        _, seq = utils.retrieve_only_one_virus_fasta(args.vref, refseqid)
        with open(filenames.tmp_fa, 'w') as outfile:
            outfile.write('>%s %s\n%s\n' % (refseqid, sample_name, seq))
        pysam.faidx(filenames.tmp_fa)

        # mask low depth regions
        mask_low_depth(args, params, filenames, filenames.tmp_fa, refseqid)

        # A stale dictionary from an earlier run is removed before picard recreates it.
        if os.path.exists(filenames.tmp_fa_dict):
            os.remove(filenames.tmp_fa_dict)

        cmd = 'java -jar %s CreateSequenceDictionary R=%s O=%s' % (
            args.picard, filenames.tmp_fa, filenames.tmp_fa_dict)
        log.logger.debug('picard command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during picard running.')

        cmd = 'java -jar %s AddOrReplaceReadGroups I=%s O=%s RGLB=lib1 RGPL=ILLUMINA RGPU=unit1 RGSM=20' % (
            args.picard, filenames.tmp_bam, filenames.tmp_rg_bam)
        log.logger.debug('picard command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during picard running.')

        pysam.index(filenames.tmp_rg_bam)

        cmd = 'gatk --java-options "-Xmx4g" HaplotypeCaller -R %s -I %s -O %s' % (
            filenames.tmp_fa, filenames.tmp_rg_bam, filenames.hhv6a_vcf_gz)
        log.logger.debug('gatk command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during gatk running.')

        cmd = 'bcftools norm -c x -f %s %s -Oz -o %s' % (
            filenames.tmp_masked_fa, filenames.hhv6a_vcf_gz,
            filenames.hhv6a_norm_vcf_gz)
        log.logger.debug('bcftools command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during bcftools running.')

        cmd = 'bcftools index %s' % filenames.hhv6a_norm_vcf_gz
        log.logger.debug('bcftools command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during bcftools running.')

        cmd = 'bcftools consensus -f %s -o %s %s' % (
            filenames.tmp_masked_fa, filenames.hhv6a_gatk_naive,
            filenames.hhv6a_norm_vcf_gz)
        log.logger.debug('bcftools command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during bcftools running.')

        # remove unnecessary files
        for path in (
                filenames.tmp_rg_bam,
                filenames.tmp_rg_bam + '.bai',
                filenames.tmp_fa,
                filenames.tmp_fa + '.fai',
                filenames.tmp_masked_fa,
                filenames.tmp_masked_fa + '.fai',
                filenames.tmp_fa_dict,
        ):
            os.remove(path)
        if args.keep is False:
            for path in (
                    filenames.hhv6a_vcf_gz + '.tbi',
                    filenames.hhv6a_norm_vcf_gz,
                    filenames.hhv6a_norm_vcf_gz + '.csi',
            ):
                os.remove(path)

        if args.denovo is True:
            # Name-sort so that mates are adjacent before exporting FASTQ.
            pysam.sort('-@', '%d' % thread_n, '-n', '-O', 'BAM', '-o',
                       filenames.tmp_sorted_bam, filenames.tmp_bam)
            # -f 1 keeps paired reads; -F 3852 drops unmapped, mate-unmapped,
            # secondary, QC-failed, duplicate and supplementary records.
            pysam.fastq('-@', '%d' % thread_n, '-N', '-f', '1', '-F', '3852',
                        '-0', '/dev/null',
                        '-1', filenames.tmp_bam_fq1,
                        '-2', filenames.tmp_bam_fq2,
                        '-s', '/dev/null', filenames.tmp_sorted_bam)

            cmd = 'metaspades.py -1 %s -2 %s -k %s -t %d -m %d -o %s' % (
                filenames.tmp_bam_fq1, filenames.tmp_bam_fq2,
                params.metaspades_kmer, thread_n, params.metaspades_memory,
                filenames.hhv6a_metaspades_o)
            log.logger.debug('metaspades command = `' + cmd + '`')
            _run_step(cmd, 'Error occurred during metaspades running.')

            # remove unnecessary files
            for path in (filenames.tmp_sorted_bam,
                         filenames.tmp_bam_fq1,
                         filenames.tmp_bam_fq2):
                os.remove(path)

        # remove unnecessary files
        os.remove(filenames.tmp_bam)
    except Exception:
        # SystemExit raised by a failed step is not an Exception subclass, so it
        # passes through here without producing a misleading traceback.
        log.logger.error('\n' + traceback.format_exc())
        raise SystemExit(1)
def map_to_dr(args, params, filenames, hhv6_refid):
    """
    Re-map the reads aligned to ``hhv6_refid`` against the index
    ``filenames.hhv6_dr_index`` and produce a duplicate-marked BAM plus a
    bedgraph coverage track.

    Steps performed, in order:

    1. extract the alignments on ``hhv6_refid`` from
       ``filenames.mapped_to_virus_bam``, name-sort them and export FASTQ;
    2. map with hisat2 (single-end when both ``args.fastqin`` and
       ``args.single`` are ``True``, paired-end otherwise);
    3. coordinate-sort and run picard MarkDuplicates;
    4. set the module-level flag ``read_mapped`` from the MarkDuplicates
       metrics file;
    5. run bamCoverage to write ``filenames.bedgraph_dr``.

    Any failure is logged and terminates the process with exit status 1.

    :param args: parsed command-line options (reads ``p``, ``fastqin``,
        ``single``, ``keep``, ``picard``)
    :param params: parameter object (reads ``hisat2_mismatch_penalties``,
        ``bedgraph_bin``)
    :param filenames: object holding the input/output/temporary paths
    :param hhv6_refid: id of the reference whose reads are re-mapped
    """
    global read_mapped

    def _run_step(cmd, failure_message):
        """Run one shell command, log its stderr, exit with status 1 on failure."""
        # NOTE(review): the command is a shell string built from file paths;
        # paths containing spaces or shell metacharacters will break it.
        out = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE)
        log.logger.debug(
            '\n' + '\n'.join([l.decode() for l in out.stderr.splitlines()]))
        if out.returncode != 0:
            log.logger.error(failure_message)
            raise SystemExit(1)

    log.logger.debug('started.')
    try:
        # Leave one thread free when three or more are available.
        thread_n = args.p if args.p <= 2 else args.p - 1

        pysam.view('-bh', '-o', filenames.tmp_bam,
                   filenames.mapped_to_virus_bam, hhv6_refid,
                   catch_stdout=False)
        pysam.sort('-n', filenames.tmp_bam, '-o', filenames.tmp_sorted_bam)
        pysam.fastq('-N', '-0', '/dev/null',
                    '-1', filenames.unmapped_merged_1,
                    '-2', filenames.unmapped_merged_2,
                    '-s', '/dev/null', filenames.tmp_sorted_bam)

        if args.fastqin is True and args.single is True:
            cmd = 'hisat2 --mp %s -t -x %s -p %d -U %s --no-spliced-alignment | samtools view -Sbh -o %s -' % (
                params.hisat2_mismatch_penalties, filenames.hhv6_dr_index,
                thread_n, filenames.unmapped_merged_1,
                filenames.mapped_unsorted_bam)
        else:
            cmd = 'hisat2 --mp %s -t -x %s -p %d -1 %s -2 %s --no-spliced-alignment | samtools view -Sbh -o %s -' % (
                params.hisat2_mismatch_penalties, filenames.hhv6_dr_index,
                thread_n, filenames.unmapped_merged_1,
                filenames.unmapped_merged_2, filenames.mapped_unsorted_bam)
        log.logger.debug('mapping command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during mapping.')
        if args.keep is not True:
            os.remove(filenames.unmapped_merged_1)
            os.remove(filenames.unmapped_merged_2)

        # sort
        pysam.sort('-@', str(thread_n), '-o', filenames.mapped_sorted,
                   filenames.mapped_unsorted_bam)
        if args.keep is not True:
            os.remove(filenames.mapped_unsorted_bam)

        # mark duplicate
        cmd = 'java -Xms896m -Xmx5376m -jar %s MarkDuplicates CREATE_INDEX=true I=%s O=%s M=%s' % (
            args.picard, filenames.mapped_sorted, filenames.mapped_to_dr_bam,
            filenames.markdup_metrix_dr)
        log.logger.debug('picard command = `' + cmd + '`')
        _run_step(cmd, 'Error occurred during picard running.')

        # remove unnecessary files
        os.remove(filenames.tmp_sorted_bam)
        if args.keep is False:
            os.remove(filenames.mapped_sorted)

        # check mapped = 0
        # Only the first metrics line containing 'Unknown Library' is inspected;
        # its third whitespace-separated token being 0 marks "nothing mapped".
        read_mapped = True
        with open(filenames.markdup_metrix_dr) as infile:
            for line in infile:
                if 'Unknown Library' in line:
                    fields = line.split()
                    if int(fields[2]) == 0:
                        read_mapped = False
                    break

        # convert to bedgraph
        cmd = 'bamCoverage --outFileFormat bedgraph -p %d --binSize %d -b %s -o %s' % (
            thread_n, params.bedgraph_bin, filenames.mapped_to_dr_bam,
            filenames.bedgraph_dr)
        log.logger.debug('bamCoverage command = "' + cmd + '"')
        _run_step(cmd, 'Error occurred during bamCoverage.')
    except Exception:
        # SystemExit raised by a failed step is not an Exception subclass, so it
        # passes through here without producing a misleading traceback.
        log.logger.error('\n' + traceback.format_exc())
        raise SystemExit(1)
def _filter_short_read_pairs(in_path1, in_path2, out_path1, out_path2, min_seq_len):
    """Copy FASTQ record pairs whose two sequences are both >= min_seq_len bases."""
    with open(in_path1) as infile1, open(in_path2) as infile2, \
            open(out_path1, 'w') as outfile1, open(out_path2, 'w') as outfile2:
        record1, record2 = [], []
        # The two files are read in lockstep, four lines (one record) at a time;
        # iteration stops at the end of the shorter file.
        for line1, line2 in zip(infile1, infile2):
            record1.append(line1)
            record2.append(line2)
            if len(record1) == 4:
                seqlen1 = len(record1[1].strip())
                seqlen2 = len(record2[1].strip())
                if seqlen1 >= min_seq_len and seqlen2 >= min_seq_len:
                    outfile1.write(''.join(record1))
                    outfile2.write(''.join(record2))
                record1, record2 = [], []


def retrieve_unmapped_reads(args, params, filenames):
    """
    Extract read pairs from the input alignment, export them as paired FASTQ
    and drop pairs containing a read shorter than ``params.min_seq_len``.

    Three selection modes exist:

    * default (``use_mate_mapped`` and ``all_discordant`` both ``False``):
      pairs with both mates unmapped (``-f 12``);
    * ``all_discordant``: every paired read not flagged as properly paired;
    * ``use_mate_mapped``: both-unmapped pairs plus pairs with exactly one
      unmapped mate.

    The filtered output is written to ``filenames.unmapped_merged_1`` and
    ``filenames.unmapped_merged_2``. Any failure is logged and terminates
    the process with exit status 1.

    :param args: parsed command-line options (reads ``p``, ``b``, ``c``,
        ``fa``, ``keep``, ``use_mate_mapped``, ``all_discordant``)
    :param params: parameter object (reads ``min_seq_len``; also forwarded
        to ``utils.gzip_or_del``)
    :param filenames: object holding the input/output/temporary paths
    """
    log.logger.debug('started.')
    try:
        # Leave one thread free when three or more are available.
        thread_n = args.p if args.p <= 2 else args.p - 1
        threads = '%d' % thread_n

        def _view_input(required_flag):
            """Filter the input BAM (args.b) or CRAM (args.c) into discordant_bam."""
            # -F 3842 drops properly-paired, secondary, QC-failed, duplicate
            # and supplementary records.
            view_args = ['-@', threads, '-f', required_flag, '-F', '3842',
                         '-b', '-o', filenames.discordant_bam]
            if args.b is not None:
                view_args.append(args.b)
            elif args.c is not None:
                view_args.extend(['--reference', args.fa, args.c])
            else:
                # Neither input given: nothing is extracted (unchanged behaviour).
                return
            pysam.view(*view_args, catch_stdout=False)

        def _concatenate(sources, destination):
            """Append every existing source FASTQ to destination, then gzip or delete it."""
            with open(destination, 'w') as outfile:
                for source in sources:
                    if os.path.exists(source):
                        with open(source) as infile:
                            for line in infile:
                                outfile.write(line)
                        utils.gzip_or_del(args, params, source)

        # retrieve discordant reads, default
        if args.use_mate_mapped is False and args.all_discordant is False:
            _view_input('12')
            pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                        '-1', filenames.unmapped_merged_pre1,
                        '-2', filenames.unmapped_merged_pre2,
                        '-s', '/dev/null', filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
        # retrieve discordant reads, non-default
        else:
            _view_input('1')
            pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                       filenames.discordant_sort_bam, filenames.discordant_bam)
            if args.keep is False:
                os.remove(filenames.discordant_bam)
            if args.all_discordant is True:
                pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                            '-1', filenames.unmapped_merged_pre1,
                            '-2', filenames.unmapped_merged_pre2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
            else:
                # Pairs with both mates unmapped.
                pysam.fastq('-@', threads, '-f', '12', '-F', '3328', '-N',
                            '-0', '/dev/null',
                            '-1', filenames.unmapped_1,
                            '-2', filenames.unmapped_2,
                            '-s', '/dev/null', filenames.discordant_sort_bam)
                if args.use_mate_mapped is True:
                    # Mapped reads whose mate is unmapped ...
                    pysam.view('-@', threads, '-f', '8', '-F', '3332', '-b',
                               '-o', filenames.unmapped_bam_3,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    # ... and unmapped reads whose mate is mapped.
                    pysam.view('-@', threads, '-f', '4', '-F', '3336', '-b',
                               '-o', filenames.unmapped_bam_4,
                               filenames.discordant_sort_bam,
                               catch_stdout=False)
                    pysam.merge('-@', threads, '-f',
                                filenames.unmapped_bam_34,
                                filenames.unmapped_bam_3,
                                filenames.unmapped_bam_4)
                    pysam.sort('-@', threads, '-n', '-O', 'BAM', '-o',
                               filenames.unmapped_sorted_34,
                               filenames.unmapped_bam_34)
                    pysam.fastq('-@', threads, '-N', '-0', '/dev/null',
                                '-1', filenames.unmapped_3,
                                '-2', filenames.unmapped_4,
                                '-s', '/dev/null',
                                filenames.unmapped_sorted_34)
                # concatenate fastq
                _concatenate([filenames.unmapped_1, filenames.unmapped_3],
                             filenames.unmapped_merged_pre1)
                _concatenate([filenames.unmapped_2, filenames.unmapped_4],
                             filenames.unmapped_merged_pre2)

        # remove short reads
        _filter_short_read_pairs(filenames.unmapped_merged_pre1,
                                 filenames.unmapped_merged_pre2,
                                 filenames.unmapped_merged_1,
                                 filenames.unmapped_merged_2,
                                 params.min_seq_len)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre1)
        utils.gzip_or_del(args, params, filenames.unmapped_merged_pre2)

        if args.keep is False:
            leftovers = [filenames.discordant_sort_bam]
            if args.use_mate_mapped is True:
                leftovers.extend([filenames.unmapped_bam_3,
                                  filenames.unmapped_bam_4,
                                  filenames.unmapped_bam_34,
                                  filenames.unmapped_sorted_34])
            # Intermediates are only created on some code paths, so each one
            # is removed only if it actually exists.
            for path in leftovers:
                if os.path.exists(path):
                    os.remove(path)
    except Exception:
        log.logger.error('\n' + traceback.format_exc())
        raise SystemExit(1)
md5(location) # In[348]: ccs_result=run_ccs('/data/yangxiaoxia/bam_sequel/m54152_170704_111850.subreads.bam','/home/kechanglin/gen3ccs_new3.ccs.bam') ccs_result # In[354]: fastq_file=pysam.fastq('/home/kechanglin/gen3ccs_new3.ccs.bam') # In[358]: aa=os.system('samtools fastq '+'/home/kechanglin/gen3ccs_new3.ccs.bam '+'> '+'/home/kechanglin/data/newfq.fastq') # In[390]: b=run_cmd('fastp -i /home/kechanglin/data/newfq.fastq -o /home/kechanglin/data/fastp_newfq.fq') # In[393]: