# NOTE: the functions below rely on the standard-library os and subprocess modules;
# sleep(), stats_pairs(), merge_pairs(), collect_stats(), create_frag(), stats_samfrag()
# and outStatsCache() are assumed to be helpers defined elsewhere in this module.
import os
import subprocess


def dedup(out_total, outpath, stats, nproc_in, nproc_out):

    pipeline = []
    try:
        dedup_command = ['pairtools', 'dedup', '--max-mismatch', '1',
                         '--method', 'max',
                         '--nproc-in', str(nproc_in), '--nproc-out', str(nproc_out),
                         '-o', outpath, out_total]
        pipeline.append(
            subprocess.Popen(dedup_command, stdout=None, bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    os.remove(out_total)

    refkey = {'cis': '410_IntraChromosomalReads',
              'trans': '420_InterChromosomalReads',
              'cis_20kb+': '412_IntraLongRangeReads(>=20Kb)',
              'total_nodups': 'total_nodups'}
    substats = stats_pairs(outpath, refkey, matchpre=['dist_freq'],
                           nproc_in=nproc_in, nproc_out=nproc_out)
    stats['130_DuplicateRemoved'] = stats['110_AfterFilteringReads'] - substats['total_nodups']
    stats['110_AfterFilteringReads'] = substats['total_nodups']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)
    stats['412_IntraShortRangeReads(<20Kb)'] = (
        stats['410_IntraChromosomalReads'] - stats['412_IntraLongRangeReads(>=20Kb)'])
    del stats['total_nodups']
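# Illustrative sketch (not part of the original module): how dedup() might be called on a
# merged, sorted pairsam file. The file names and the pre-populated stats key are
# hypothetical placeholders; in the real pipeline they come from parse_bam()/biorep_level()
# defined further down in this module.
def _demo_dedup():
    stats = {'110_AfterFilteringReads': 1_000_000}   # hypothetical starting count
    dedup('sample.total.pairsam.gz',                 # merged input (hypothetical path)
          'sample.pairsam.gz',                       # deduplicated output (hypothetical path)
          stats, nproc_in=3, nproc_out=8)
    return stats                                     # dedup() updates the dict in place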
def map_core(fastq_1, fastq_2, ref, outdir, aligner='minimap2', outformat='SAM', nthread=1):

    outformat = '.' + outformat.lower()
    # output file name
    if fastq_1.endswith('_1.fastq.gz'):
        outpath = os.path.join(
            outdir, os.path.split(fastq_1)[1].replace('_1.fastq.gz', outformat))
    else:
        outpath = os.path.join(
            outdir, os.path.split(fastq_1)[1].replace('_1.fastq', outformat))

    # ref: reference genome index
    if aligner == 'minimap2':
        map_command = ['minimap2', '-ax', 'sr', '-t', str(nthread), ref, fastq_1, fastq_2]
    else:
        map_command = ['bwa', 'mem', '-SP5M', '-t', str(nthread), ref, fastq_1, fastq_2]

    if outformat == '.sam':
        bam_command = []
    else:
        bam_command = ['samtools', 'view', '-bS', '-']

    pipeline = []
    try:
        # Mapping
        pipeline.append(
            subprocess.Popen(map_command,
                             stdout=subprocess.PIPE if bam_command else open(outpath, 'wb'),
                             bufsize=-1))
        if bam_command:
            pipeline.append(
                subprocess.Popen(bam_command,
                                 stdin=pipeline[-1].stdout,
                                 stdout=open(outpath, 'wb'),
                                 bufsize=-1))
        pipeline[-1].wait()
    finally:
        sleep()
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    return outpath
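# Illustrative sketch (not part of the original module): mapping one read-pair library with
# map_core(). The FASTQ names, index path and output directory are hypothetical; the
# function derives the output file name from the "_1.fastq(.gz)" suffix of the first mate.
def _demo_map_core():
    bam_path = map_core('sample_1.fastq.gz', 'sample_2.fastq.gz',
                        ref='hg38.mmi',        # minimap2 index (hypothetical path)
                        outdir='alignments',
                        aligner='minimap2',
                        outformat='BAM',       # pipes SAM through "samtools view -bS"
                        nthread=8)
    return bam_path                            # e.g. "alignments/sample.bam"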
def biorep_level(pair_paths, outpre, frag_path, tmpdir):

    # a temporary file to store all unfiltered alignments
    out_total = outpre + '.total.pairsam.gz'
    merge_pairs(pair_paths, out_total, tmpdir)
    stats = collect_stats(pair_paths)['pseudo']

    # final biorep-level pairsam
    outpath = outpre + '.pairsam.gz'

    # select . dedup . filter
    pipeline = []
    try:
        dedup_command = ['pairtools', 'dedup', '--max-mismatch', '1',
                         '--method', 'max', '-o', outpath, out_total]
        pipeline.append(
            subprocess.Popen(dedup_command, stdout=None, bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    os.remove(out_total)

    refkey = {'cis': '410_IntraChromosomalReads',
              'trans': '420_InterChromosomalReads',
              'cis_20kb+': '412_IntraLongRangeReads(>=20Kb)',
              'total_nodups': 'total_nodups'}
    substats = stats_pairs(outpath, refkey, matchpre=['dist_freq'])
    stats['130_DuplicateRemoved'] = stats['110_AfterFilteringReads'] - substats['total_nodups']
    stats['110_AfterFilteringReads'] = substats['total_nodups']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)
    stats['412_IntraShortRangeReads(<20Kb)'] = (
        stats['410_IntraChromosomalReads'] - stats['412_IntraLongRangeReads(>=20Kb)'])
    del stats['total_nodups']

    return stats, outpath
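# Illustrative sketch (not part of the original module): merging per-run pairsams into a
# biological-replicate pairsam with biorep_level(). The pairsam paths, output prefix and
# temporary directory are hypothetical; note that frag_path is accepted but not used by
# the function as written above.
def _demo_biorep_level():
    stats, pairsam = biorep_level(['run1.pairsam.gz', 'run2.pairsam.gz'],  # per-run pairsams
                                  outpre='sample-rep1',                    # output prefix
                                  frag_path='hg38.HindIII.bed',            # hypothetical fragment file
                                  tmpdir='/tmp')
    return stats, pairsam   # aggregated stats dict and "sample-rep1.pairsam.gz"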
def parse_bam(bam, outfile, genomepath, chromsizes, assembly, min_mapq,
              max_molecule_size, max_inter_align_gap, walks_policy,
              include_readid, include_sam, drop_seq, tmpdir, enzyme):

    frag_path = create_frag(genomepath, chromsizes, enzyme, tmpdir)  # restriction-fragment file (e.g. HindIII)
    out_total = outfile.replace('.pairsam.gz', '.total.pairsam.gz')

    basic_command = ['pairtools', 'parse', '-c', chromsizes, '--assembly', assembly,
                     '--min-mapq', str(min_mapq),
                     '--max-molecule-size', str(max_molecule_size),
                     '--max-inter-align-gap', str(max_inter_align_gap),
                     '--walks-policy', walks_policy]
    if not include_readid:
        basic_command.append('--drop-readid')
    if not include_sam:
        basic_command.append('--drop-sam')
    if drop_seq:
        basic_command.append('--drop-seq')
    basic_command.append(bam)

    pipeline = []
    try:
        pipeline.append(
            subprocess.Popen(basic_command, stdout=subprocess.PIPE, bufsize=-1)
        )
        sort_command = ['pairtools', 'sort', '-o', out_total, '--nproc', '8',
                        '--memory', '2G', '--tmpdir', tmpdir]
        pipeline.append(
            subprocess.Popen(sort_command, stdin=pipeline[-1].stdout, stdout=None, bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    # stats at the bottom level
    refkey = {'total': '000_SequencedReads',
              'total_mapped': '010_DoubleSideMappedReads',
              'total_single_sided_mapped': '020_SingleSideMappedReads',
              'total_unmapped': '030_UnmappedReads'}
    stats = stats_pairs(out_total, refkey)  # use all unfiltered reads for the first stats, including mapping status
    stats['100_NormalPairs'] = stats['010_DoubleSideMappedReads']

    # from the total pairsam, keep only uniquely mapped (UU/UR/RU) pairs
    outpath_1 = outfile.replace('.pairsam.gz', '.select.pairsam.gz')
    pipeline = []
    try:
        select_command = ['pairtools', 'select',
                          '(pair_type=="UU") or (pair_type=="UR") or (pair_type=="RU")',
                          '-o', outpath_1, out_total]
        pipeline.append(
            subprocess.Popen(select_command, stdout=None, bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    os.remove(out_total)

    outpath_2 = outfile.replace('.pairsam.gz', '.select.samefrag.pairsam.gz')
    pipeline = []
    try:
        # assign restriction fragments
        restrict_command = ['pairtools', 'restrict', '-f', frag_path, outpath_1]
        pipeline.append(
            subprocess.Popen(restrict_command, stdout=subprocess.PIPE, bufsize=-1)
        )
        # COLS[-6]==COLS[-3] identifies same-fragment pairs; the column indices may change
        # with future pairtools versions. outfile (.pairsam.gz) receives the remaining pairs;
        # same-fragment pairs go to .select.samefrag, kept only for statistics and for
        # peak pile-up purposes.
        select_command = ['pairtools', 'select', '--output-rest', outfile,
                          '-o', outpath_2, '(COLS[-6]==COLS[-3]) and (chrom1==chrom2)']
        pipeline.append(
            subprocess.Popen(select_command, stdin=pipeline[-1].stdout, stdout=None, bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    os.remove(outpath_1)

    substats, libsize = stats_samfrag(outpath_2)  # same-fragment statistics
    stats['110_AfterFilteringReads'] = stats['100_NormalPairs'] - substats['120_SameFragmentReads']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)
    stats['libsize'] = libsize

    stats_pool = {'pseudo': stats}
    stats_pre = outfile.replace('.pairsam.gz', '.pstats')  # pickled stats
    outStatsCache(stats_pool, stats_pre)
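# Illustrative sketch (not part of the original module): parsing an aligned BAM into a
# sorted pairsam plus per-step statistics with parse_bam(). All paths, the assembly label,
# the parameter values and the enzyme name are hypothetical placeholders. The function
# returns nothing; it writes the output pairsams and pickles the stats next to the output
# via outStatsCache().
def _demo_parse_bam():
    parse_bam(bam='alignments/sample.bam',
              outfile='pairs/sample.pairsam.gz',
              genomepath='hg38.fa',
              chromsizes='hg38.chrom.sizes',
              assembly='hg38',
              min_mapq=30, max_molecule_size=750, max_inter_align_gap=30,
              walks_policy='mask',
              include_readid=False, include_sam=False, drop_seq=True,
              tmpdir='/tmp', enzyme='HindIII')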