Exemplo n.º 1
0
def dedup(out_total, outpath, stats, nproc_in, nproc_out):
    """Deduplicate a pairs file with ``pairtools dedup`` and update *stats* in place.

    Parameters
    ----------
    out_total : str
        Input file handed to ``pairtools dedup``. It is deleted once the
        command has finished successfully.
    outpath : str
        Output file written by ``pairtools dedup`` (``-o``).
    stats : dict
        Counters updated in place; must already hold
        ``'110_AfterFilteringReads'``.
    nproc_in, nproc_out : int
        Forwarded to ``--nproc-in`` / ``--nproc-out`` and to ``stats_pairs``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``pairtools dedup`` exits with a non-zero status; *out_total* is
        left in place in that case.
    """
    dedup_command = ['pairtools', 'dedup', '--max-mismatch', '1', '--method', 'max',
                     '--nproc-in', str(nproc_in), '--nproc-out', str(nproc_out),
                     '-o', outpath, out_total]

    pipeline = []
    try:
        pipeline.append(
            subprocess.Popen(dedup_command,
                stdout=None,
                bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        # Only reached with a live child when wait() was interrupted.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    # Stop on failure so the input is not deleted and the statistics are not
    # computed from a missing or partial output file.
    returncode = pipeline[-1].returncode
    if returncode != 0:
        raise subprocess.CalledProcessError(returncode, dedup_command)

    os.remove(out_total)

    # Maps the keys requested from stats_pairs to the names used in *stats*.
    refkey = {'cis':'410_IntraChromosomalReads',
              'trans':'420_InterChromosomalReads',
              'cis_20kb+':'412_IntraLongRangeReads(>=20Kb)',
              'total_nodups':'total_nodups'}

    substats = stats_pairs(outpath, refkey, matchpre=['dist_freq'], nproc_in=nproc_in, nproc_out=nproc_out)
    # Duplicates removed = pairs before dedup minus pairs that survived it.
    stats['130_DuplicateRemoved'] = stats['110_AfterFilteringReads'] - substats['total_nodups']
    stats['110_AfterFilteringReads'] = substats['total_nodups']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)
    stats['412_IntraShortRangeReads(<20Kb)'] = stats['410_IntraChromosomalReads'] - stats['412_IntraLongRangeReads(>=20Kb)']
    # 'total_nodups' was only needed for the arithmetic above.
    del stats['total_nodups']
Exemplo n.º 2
0
def map_core(fastq_1,
             fastq_2,
             ref,
             outdir,
             aligner='minimap2',
             outformat='SAM',
             nthread=1):
    """Align a pair of FASTQ files and write the alignments into *outdir*.

    Parameters
    ----------
    fastq_1, fastq_2 : str
        Paths of the two mate files. The output name is derived from the
        basename of *fastq_1* by replacing ``_1.fastq.gz`` / ``_1.fastq``
        with the output extension.
    ref : str
        Reference genome index passed to the aligner.
    outdir : str
        Directory that receives the output file.
    aligner : str
        ``'minimap2'`` runs ``minimap2 -ax sr``; any other value runs
        ``bwa mem -SP5M``.
    outformat : str
        ``'SAM'`` (case-insensitive) stores the aligner output directly;
        any other value pipes it through ``samtools view -bS``.
    nthread : int
        Thread count passed to the aligner with ``-t``.

    Returns
    -------
    str
        Path of the alignment file.

    Raises
    ------
    subprocess.CalledProcessError
        If the aligner or ``samtools`` exits with a non-zero status.
    """
    outformat = '.' + outformat.lower()
    # output file name
    # NOTE(review): when fastq_1 carries neither suffix the basename is left
    # unchanged, so outpath can point at the input itself if outdir is the
    # input directory.
    if fastq_1.endswith('_1.fastq.gz'):
        outpath = os.path.join(
            outdir,
            os.path.split(fastq_1)[1].replace('_1.fastq.gz', outformat))
    else:
        outpath = os.path.join(
            outdir,
            os.path.split(fastq_1)[1].replace('_1.fastq', outformat))

    # ref: reference genome index
    if aligner == 'minimap2':
        map_command = [
            'minimap2', '-ax', 'sr', '-t',
            str(nthread), ref, fastq_1, fastq_2
        ]
    else:
        map_command = [
            'bwa', 'mem', '-SP5M', '-t',
            str(nthread), ref, fastq_1, fastq_2
        ]

    if outformat == '.sam':
        bam_command = []
    else:
        bam_command = ['samtools', 'view', '-bS', '-']

    # Commands in pipeline order, kept for error reporting.
    commands = [map_command] + ([bam_command] if bam_command else [])

    pipeline = []
    try:
        # The children get their own copy of the descriptor; the context
        # manager closes the parent's handle once they are done.
        with open(outpath, 'wb') as out:
            # Mapping
            mapper = subprocess.Popen(
                map_command,
                stdout=subprocess.PIPE if bam_command else out,
                bufsize=-1)
            pipeline.append(mapper)

            if bam_command:
                pipeline.append(
                    subprocess.Popen(bam_command,
                                     stdin=mapper.stdout,
                                     stdout=out,
                                     bufsize=-1))
                # Drop the parent's copy of the pipe so the aligner receives
                # SIGPIPE if samtools exits early instead of blocking.
                mapper.stdout.close()

            # Wait for every stage so each exit status can be inspected.
            for process in reversed(pipeline):
                process.wait()
    finally:
        sleep()
        # Only reached with live children when something above raised.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    # Do not hand back a truncated or empty alignment file silently.
    for process, command in zip(pipeline, commands):
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)

    return outpath
Exemplo n.º 3
0
def map_core(fastq_1, fastq_2, ref, outdir, aligner='minimap2', outformat='SAM', 
             nthread=1):
    """Align a pair of FASTQ files and write the alignments into *outdir*.

    Parameters
    ----------
    fastq_1, fastq_2 : str
        Paths of the two mate files. The output name is derived from the
        basename of *fastq_1* by replacing ``_1.fastq.gz`` / ``_1.fastq``
        with the output extension.
    ref : str
        Reference genome index passed to the aligner.
    outdir : str
        Directory that receives the output file.
    aligner : str
        ``'minimap2'`` runs ``minimap2 -ax sr``; any other value runs
        ``bwa mem -SP``.
    outformat : str
        ``'SAM'`` (case-insensitive) stores the aligner output directly;
        any other value pipes it through ``samtools view -bS``.
    nthread : int
        Thread count passed to the aligner with ``-t``.

    Returns
    -------
    str
        Path of the alignment file.

    Raises
    ------
    subprocess.CalledProcessError
        If the aligner or ``samtools`` exits with a non-zero status.
    """
    outformat = '.' + outformat.lower()
    # output file name
    # NOTE(review): when fastq_1 carries neither suffix the basename is left
    # unchanged, so outpath can point at the input itself if outdir is the
    # input directory.
    if fastq_1.endswith('_1.fastq.gz'):
        outpath = os.path.join(outdir,
                        os.path.split(fastq_1)[1].replace('_1.fastq.gz',outformat))
    else:
        outpath = os.path.join(outdir,
                        os.path.split(fastq_1)[1].replace('_1.fastq',outformat))

    # ref: reference genome index
    if aligner=='minimap2':
        map_command = ['minimap2', '-ax', 'sr', '-t', str(nthread), ref, fastq_1, fastq_2]
    else:
        map_command = ['bwa', 'mem', '-SP', '-t', str(nthread), ref, fastq_1, fastq_2]

    if outformat=='.sam':
        bam_command = []
    else:
        bam_command = ['samtools', 'view', '-bS', '-']

    # Commands in pipeline order, kept for error reporting.
    commands = [map_command] + ([bam_command] if bam_command else [])

    pipeline = []
    try:
        # The children get their own copy of the descriptor; the context
        # manager closes the parent's handle once they are done.
        with open(outpath, 'wb') as out:
            # Mapping
            mapper = subprocess.Popen(map_command,
                        stdout=subprocess.PIPE if bam_command else out,
                        bufsize=-1)
            pipeline.append(mapper)

            if bam_command:
                pipeline.append(
                        subprocess.Popen(bam_command,
                            stdin=mapper.stdout,
                            stdout=out,
                            bufsize=-1))
                # Drop the parent's copy of the pipe so the aligner receives
                # SIGPIPE if samtools exits early instead of blocking.
                mapper.stdout.close()

            # Wait for every stage so each exit status can be inspected.
            for process in reversed(pipeline):
                process.wait()
    finally:
        sleep()
        # Only reached with live children when something above raised.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    # Do not hand back a truncated or empty alignment file silently.
    for process, command in zip(pipeline, commands):
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)

    return outpath
Exemplo n.º 4
0
def biorep_level(pair_paths, outpre, frag_path, tmpdir):
    """Merge, deduplicate and summarise the pairs files of one replicate.

    Parameters
    ----------
    pair_paths : list of str
        Input files passed to ``merge_pairs`` and ``collect_stats``.
    outpre : str
        Prefix of the output files; the result is ``outpre + '.pairsam.gz'``.
    frag_path : str
        Not used by this function; kept so existing callers keep working.
    tmpdir : str
        Temporary directory passed to ``merge_pairs``.

    Returns
    -------
    tuple
        ``(stats, outpath)``: the updated counters and the path of the
        deduplicated file.

    Raises
    ------
    subprocess.CalledProcessError
        If ``pairtools dedup`` exits with a non-zero status; the merged
        temporary file is left in place in that case.
    """
    # a temporary file to store unfiltered all alignments
    out_total = outpre + '.total.pairsam.gz'
    merge_pairs(pair_paths, out_total, tmpdir)
    stats = collect_stats(pair_paths)['pseudo']

    # Final biorep level pairsam
    outpath = outpre + '.pairsam.gz'  # select.dedup.filter

    dedup_command = [
        'pairtools', 'dedup', '--max-mismatch', '1', '--method', 'max',
        '-o', outpath, out_total
    ]

    pipeline = []
    try:
        pipeline.append(
            subprocess.Popen(dedup_command, stdout=None, bufsize=-1))
        pipeline[-1].wait()
    finally:
        sleep()
        # Only reached with a live child when wait() was interrupted.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    # Stop on failure so the merged file is not deleted and the statistics
    # are not computed from a missing or partial output file.
    returncode = pipeline[-1].returncode
    if returncode != 0:
        raise subprocess.CalledProcessError(returncode, dedup_command)

    os.remove(out_total)

    # Maps the keys requested from stats_pairs to the names used in *stats*.
    refkey = {
        'cis': '410_IntraChromosomalReads',
        'trans': '420_InterChromosomalReads',
        'cis_20kb+': '412_IntraLongRangeReads(>=20Kb)',
        'total_nodups': 'total_nodups'
    }

    substats = stats_pairs(outpath, refkey, matchpre=['dist_freq'])
    # Duplicates removed = pairs before dedup minus pairs that survived it.
    stats['130_DuplicateRemoved'] = stats[
        '110_AfterFilteringReads'] - substats['total_nodups']
    stats['110_AfterFilteringReads'] = substats['total_nodups']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)
    stats['412_IntraShortRangeReads(<20Kb)'] = stats[
        '410_IntraChromosomalReads'] - stats['412_IntraLongRangeReads(>=20Kb)']
    # 'total_nodups' was only needed for the arithmetic above.
    del stats['total_nodups']

    return stats, outpath
Exemplo n.º 5
0
def biorep_level(pair_paths, outpre, frag_path, tmpdir):
    """Merge, deduplicate and summarise the pairs files of one replicate.

    Parameters
    ----------
    pair_paths : list of str
        Input files passed to ``merge_pairs`` and ``collect_stats``.
    outpre : str
        Prefix of the output files; the result is ``outpre + '.pairsam.gz'``.
    frag_path : str
        Not used by this function; kept so existing callers keep working.
    tmpdir : str
        Temporary directory passed to ``merge_pairs``.

    Returns
    -------
    tuple
        ``(stats, outpath)``: the updated counters and the path of the
        deduplicated file.

    Raises
    ------
    subprocess.CalledProcessError
        If ``pairtools dedup`` exits with a non-zero status; the merged
        temporary file is left in place in that case.
    """
    # a temporary file to store unfiltered all alignments
    out_total = outpre + '.total.pairsam.gz'
    merge_pairs(pair_paths, out_total, tmpdir)
    stats = collect_stats(pair_paths)['pseudo']

    # Final biorep level pairsam
    outpath = outpre + '.pairsam.gz' # select.dedup.filter

    dedup_command = ['pairtools', 'dedup', '--max-mismatch', '1', '--method', 'max', '-o', outpath, out_total]

    pipeline = []
    try:
        pipeline.append(
            subprocess.Popen(dedup_command,
                stdout=None,
                bufsize=-1)
        )
        pipeline[-1].wait()
    finally:
        sleep()
        # Only reached with a live child when wait() was interrupted.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    # Stop on failure so the merged file is not deleted and the statistics
    # are not computed from a missing or partial output file.
    returncode = pipeline[-1].returncode
    if returncode != 0:
        raise subprocess.CalledProcessError(returncode, dedup_command)

    os.remove(out_total)

    # Maps the keys requested from stats_pairs to the names used in *stats*.
    refkey = {'cis':'410_IntraChromosomalReads',
              'trans':'420_InterChromosomalReads',
              'cis_20kb+':'412_IntraLongRangeReads(>=20Kb)',
              'total_nodups':'total_nodups'}

    substats = stats_pairs(outpath, refkey, matchpre=['dist_freq'])
    # Duplicates removed = pairs before dedup minus pairs that survived it.
    stats['130_DuplicateRemoved'] = stats['110_AfterFilteringReads'] - substats['total_nodups']
    stats['110_AfterFilteringReads'] = substats['total_nodups']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)
    stats['412_IntraShortRangeReads(<20Kb)'] = stats['410_IntraChromosomalReads'] - stats['412_IntraLongRangeReads(>=20Kb)']
    # 'total_nodups' was only needed for the arithmetic above.
    del stats['total_nodups']

    return stats, outpath
Exemplo n.º 6
0
def _run_pipeline(commands):
    """Run *commands* as a pipeline and wait for all of them to finish.

    Each command's stdout is connected to the stdin of the next one; the
    last command inherits the parent's stdout.

    Raises
    ------
    subprocess.CalledProcessError
        For the first command (in pipeline order) that exits non-zero.
    """
    pipeline = []
    try:
        last = len(commands) - 1
        for index, command in enumerate(commands):
            pipeline.append(
                subprocess.Popen(command,
                    stdin=pipeline[-1].stdout if pipeline else None,
                    stdout=None if index == last else subprocess.PIPE,
                    bufsize=-1)
            )
            if len(pipeline) > 1:
                # Drop the parent's copy of the pipe so the upstream command
                # receives SIGPIPE if its consumer exits early.
                pipeline[-2].stdout.close()

        # Wait for every stage so each exit status can be inspected.
        for process in reversed(pipeline):
            process.wait()
    finally:
        sleep()
        # Only reached with live children when something above raised.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    for process, command in zip(pipeline, commands):
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)


def parse_bam(bam, outfile, genomepath, chromsizes, assembly, min_mapq, max_molecule_size, max_inter_align_gap,
              walks_policy, include_readid, include_sam, drop_seq, tmpdir, enzyme):
    """Turn an alignment file into a filtered pairs file plus cached stats.

    Steps: ``pairtools parse | pairtools sort``, ``pairtools select`` on the
    pair type, then ``pairtools restrict | pairtools select`` to split
    same-fragment pairs from the rest. Intermediate files are removed as
    soon as the step consuming them has succeeded.

    Parameters
    ----------
    bam : str
        Alignment file given to ``pairtools parse``.
    outfile : str
        Final pairs file. Intermediate and stats file names are derived by
        replacing ``'.pairsam.gz'`` in this path, so it is expected to end
        with that suffix.
    genomepath, enzyme :
        Passed to ``create_frag`` together with *chromsizes* and *tmpdir*.
    chromsizes, assembly, min_mapq, max_molecule_size, max_inter_align_gap, walks_policy :
        Forwarded to the matching ``pairtools parse`` options.
    include_readid, include_sam : bool
        When false, ``--drop-readid`` / ``--drop-sam`` is added.
    drop_seq : bool
        When true, ``--drop-seq`` is added.
    tmpdir : str
        Temporary directory for ``pairtools sort``.

    Raises
    ------
    subprocess.CalledProcessError
        If any pairtools command exits with a non-zero status; files that
        have not been consumed yet are left in place.
    """
    # Fragment file consumed by `pairtools restrict -f` below.
    frag_path = create_frag(genomepath, chromsizes, enzyme, tmpdir)
    out_total = outfile.replace('.pairsam.gz', '.total.pairsam.gz')

    basic_command = ['pairtools', 'parse', '-c', chromsizes, '--assembly', assembly,
                     '--min-mapq', str(min_mapq), '--max-molecule-size', str(max_molecule_size),
                     '--max-inter-align-gap', str(max_inter_align_gap), '--walks-policy', walks_policy]
    if not include_readid:
        basic_command.append('--drop-readid')

    if not include_sam:
        basic_command.append('--drop-sam')

    if drop_seq:
        basic_command.append('--drop-seq')
    basic_command.append(bam)

    sort_command = ['pairtools', 'sort', '-o', out_total, '--nproc', '8', '--memory', '2G', '--tmpdir', tmpdir]
    _run_pipeline([basic_command, sort_command])

    # stats at the bottom level
    refkey = {'total':'000_SequencedReads',
              'total_mapped':'010_DoubleSideMappedReads',
              'total_single_sided_mapped':'020_SingleSideMappedReads',
              'total_unmapped':'030_UnmappedReads'
              }
    # Use all unfiltered reads for the first stats, including mapping status.
    stats = stats_pairs(out_total, refkey)
    stats['100_NormalPairs'] = stats['010_DoubleSideMappedReads']

    # Keep only the UU, UR and RU pair types.
    outpath_1 = outfile.replace('.pairsam.gz', '.select.pairsam.gz')
    select_command = ['pairtools', 'select', '(pair_type=="UU") or (pair_type=="UR") or (pair_type=="RU")',
                      '-o', outpath_1, out_total]
    _run_pipeline([select_command])

    os.remove(out_total)

    outpath_2 = outfile.replace('.pairsam.gz', '.select.samefrag.pairsam.gz')
    # assign fragment
    restrict_command = ['pairtools', 'restrict', '-f', frag_path, outpath_1]

    ####### COLS[-6]==COLS[-3], the index may change to follow pairtools
    # Same-fragment pairs go to outpath_2 (used for the stats below); all
    # other pairs go to outfile via --output-rest.
    samefrag_command = ['pairtools', 'select', '--output-rest', outfile, '-o', outpath_2,
                        '(COLS[-6]==COLS[-3]) and (chrom1==chrom2)']
    _run_pipeline([restrict_command, samefrag_command])

    os.remove(outpath_1)

    substats, libsize = stats_samfrag(outpath_2) # same-fragment pairs
    stats['110_AfterFilteringReads'] = stats['100_NormalPairs'] - substats['120_SameFragmentReads']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)

    stats['libsize'] = libsize

    stats_pool = {'pseudo': stats}
    stats_pre = outfile.replace('.pairsam.gz', '.pstats') # pickled stats
    outStatsCache(stats_pool, stats_pre)
Exemplo n.º 7
0
def _run_pipeline(commands):
    """Run *commands* as a pipeline and wait for all of them to finish.

    Each command's stdout is connected to the stdin of the next one; the
    last command inherits the parent's stdout.

    Raises
    ------
    subprocess.CalledProcessError
        For the first command (in pipeline order) that exits non-zero.
    """
    pipeline = []
    try:
        last = len(commands) - 1
        for index, command in enumerate(commands):
            pipeline.append(
                subprocess.Popen(command,
                    stdin=pipeline[-1].stdout if pipeline else None,
                    stdout=None if index == last else subprocess.PIPE,
                    bufsize=-1)
            )
            if len(pipeline) > 1:
                # Drop the parent's copy of the pipe so the upstream command
                # receives SIGPIPE if its consumer exits early.
                pipeline[-2].stdout.close()

        # Wait for every stage so each exit status can be inspected.
        for process in reversed(pipeline):
            process.wait()
    finally:
        sleep()
        # Only reached with live children when something above raised.
        for process in pipeline:
            if process.poll() is None:
                process.terminate()

    for process, command in zip(pipeline, commands):
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)


def parse_bam(bam, outfile, genomepath, chromsizes, assembly, min_mapq, max_molecule_size, max_inter_align_gap,
              walks_policy, include_readid, include_sam, drop_seq, tmpdir, enzyme):
    """Turn an alignment file into a filtered pairs file plus cached stats.

    Steps: ``pairtools parse | pairtools sort``, ``pairtools select`` on the
    pair type, then ``pairtools restrict | pairtools select`` to split
    same-fragment pairs from the rest. Intermediate files are removed as
    soon as the step consuming them has succeeded.

    Parameters
    ----------
    bam : str
        Alignment file given to ``pairtools parse``.
    outfile : str
        Final pairs file. Intermediate and stats file names are derived by
        replacing ``'.pairsam.gz'`` in this path, so it is expected to end
        with that suffix.
    genomepath, enzyme :
        Passed to ``create_frag`` together with *chromsizes* and *tmpdir*.
    chromsizes, assembly, min_mapq, max_molecule_size, max_inter_align_gap, walks_policy :
        Forwarded to the matching ``pairtools parse`` options.
    include_readid, include_sam : bool
        When false, ``--drop-readid`` / ``--drop-sam`` is added.
    drop_seq : bool
        When true, ``--drop-seq`` is added.
    tmpdir : str
        Temporary directory for ``pairtools sort``.

    Raises
    ------
    subprocess.CalledProcessError
        If any pairtools command exits with a non-zero status; files that
        have not been consumed yet are left in place.
    """
    # Fragment file consumed by `pairtools restrict -f` below.
    frag_path = create_frag(genomepath, chromsizes, enzyme, tmpdir)
    out_total = outfile.replace('.pairsam.gz', '.total.pairsam.gz')

    basic_command = ['pairtools', 'parse', '-c', chromsizes, '--assembly', assembly,
                     '--min-mapq', str(min_mapq), '--max-molecule-size', str(max_molecule_size),
                     '--max-inter-align-gap', str(max_inter_align_gap), '--walks-policy', walks_policy]
    if not include_readid:
        basic_command.append('--drop-readid')

    if not include_sam:
        basic_command.append('--drop-sam')

    if drop_seq:
        basic_command.append('--drop-seq')
    basic_command.append(bam)

    sort_command = ['pairtools', 'sort', '-o', out_total, '--nproc', '8', '--memory', '2G', '--tmpdir', tmpdir]
    _run_pipeline([basic_command, sort_command])

    # stats at the bottom level
    refkey = {'total':'000_SequencedReads',
              'total_mapped':'010_DoubleSideMappedReads',
              'total_single_sided_mapped':'020_SingleSideMappedReads',
              'total_unmapped':'030_UnmappedReads'
              }
    # Computed on the sorted, still unfiltered file.
    stats = stats_pairs(out_total, refkey)
    stats['100_NormalPairs'] = stats['010_DoubleSideMappedReads']

    # Keep only the UU, UR and RU pair types.
    outpath_1 = outfile.replace('.pairsam.gz', '.select.pairsam.gz')
    select_command = ['pairtools', 'select', '(pair_type=="UU") or (pair_type=="UR") or (pair_type=="RU")',
                      '-o', outpath_1, out_total]
    _run_pipeline([select_command])

    os.remove(out_total)

    outpath_2 = outfile.replace('.pairsam.gz', '.select.samefrag.pairsam.gz')
    # assign fragment
    restrict_command = ['pairtools', 'restrict', '-f', frag_path, outpath_1]

    ####### COLS[-6]==COLS[-3], the index may change to follow pairtools
    # Same-fragment pairs go to outpath_2 (used for the stats below); all
    # other pairs go to outfile via --output-rest.
    samefrag_command = ['pairtools', 'select', '--output-rest', outfile, '-o', outpath_2,
                        '(COLS[-6]==COLS[-3]) and (chrom1==chrom2)']
    _run_pipeline([restrict_command, samefrag_command])

    os.remove(outpath_1)

    substats, libsize = stats_samfrag(outpath_2)
    stats['110_AfterFilteringReads'] = stats['100_NormalPairs'] - substats['120_SameFragmentReads']
    stats['400_TotalContacts'] = stats['110_AfterFilteringReads']
    stats.update(substats)

    stats['libsize'] = libsize

    stats_pool = {'pseudo': stats}
    stats_pre = outfile.replace('.pairsam.gz', '.pstats') # pickled stats
    outStatsCache(stats_pool, stats_pre)