예제 #1
0
def stats_samfrag(samfrag_pairs, sample_size=100000):
    """Count same-fragment read pairs by strand pattern and sample their sizes.

    Every record of the pairs file is classified by the strands of its two
    sides. The input file is deleted once it has been read successfully.

    Parameters
    ----------
    samfrag_pairs : str
        Path of the pairs file to read (opened with pairtools' ``auto_open``).
        The file is removed after it has been read.
    sample_size : int
        Maximum number of size values returned.

    Returns
    -------
    stats : defaultdict(int)
        ``'120_SameFragmentReads'`` is the total number of records,
        ``'124_DanglingReads'`` counts '+'/'-' pairs,
        ``'122_SelfLigationReads'`` counts '-'/'+' pairs and
        ``'126_UnknownMechanism'`` counts every other strand combination.
        A key is present only if at least one record incremented it.
    libsize : numpy.ndarray
        Up to ``sample_size`` randomly chosen ``pos2 - pos1`` values taken
        from the '+'/'-' pairs.
    """
    from pairtools import _fileio, _pairsam_format, _headerops

    stats = defaultdict(int)
    libsize = []

    instream = _fileio.auto_open(samfrag_pairs, mode='r')
    try:
        _, body_stream = _headerops.get_header(instream)
        for line in body_stream:
            cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
            pos1 = int(cols[_pairsam_format.COL_P1])
            strand1 = cols[_pairsam_format.COL_S1]
            pos2 = int(cols[_pairsam_format.COL_P2])
            strand2 = cols[_pairsam_format.COL_S2]
            stats['120_SameFragmentReads'] += 1
            if strand1 == '+' and strand2 == '-':  # dangling reads
                stats['124_DanglingReads'] += 1
                libsize.append(pos2 - pos1)
            elif strand1 == '-' and strand2 == '+':  # self ligation
                stats['122_SelfLigationReads'] += 1
            else:
                stats['126_UnknownMechanism'] += 1
    finally:
        # Release the handle even when a malformed record aborts parsing.
        instream.close()

    # Only reached when the whole file was parsed; a failed run keeps the
    # input on disk for inspection.
    os.remove(samfrag_pairs)

    # Shuffle before truncating so the returned values are a random sample
    # rather than the first ``sample_size`` records of the file.
    libsize = np.r_[libsize]
    np.random.shuffle(libsize)
    libsize = libsize[:sample_size]

    return stats, libsize
예제 #2
0
def split_pairsam(pairsam_path):
    """Produce an indexed ``.pairs.gz`` file from a ``.pairsam.gz`` file.

    The ``#columns:`` header line is inspected: when it lists both ``sam1``
    and ``sam2``, ``pairtools split`` is run to write a ``.pairs.gz`` and a
    ``.bam`` file; otherwise the input is copied to the ``.pairs.gz`` name
    with ``cp``. The resulting pairs file is then indexed with ``pairix``.

    Parameters
    ----------
    pairsam_path : str
        Path of the input file. Output names are derived by replacing
        ``'.pairsam.gz'`` in this path, so a path that does not contain that
        suffix yields an output path equal to the input path.

    Returns
    -------
    str
        Path of the ``.pairs.gz`` file.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the external commands exits with a non-zero status.
    """
    from pairtools import _fileio, _headerops

    # Check the header of the .pairsam.gz for SAM columns.
    has_sam = False
    instream = _fileio.auto_open(pairsam_path, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            if not row.startswith('#columns:'):
                continue
            columns = row.split(':', 1)[1].strip().split()
            if 'sam1' in columns and 'sam2' in columns:
                has_sam = True
    finally:
        instream.close()

    pairpath = pairsam_path.replace('.pairsam.gz', '.pairs.gz')

    # Commands are passed as argument lists without a shell so that paths
    # containing spaces or shell metacharacters are handed over verbatim.
    if has_sam:
        bampath = pairsam_path.replace('.pairsam.gz', '.bam')
        subprocess.check_call(['pairtools', 'split',
                               '--output-pairs', pairpath,
                               '--output-sam', bampath,
                               pairsam_path])
    else:
        subprocess.check_call(['cp', pairsam_path, pairpath])

    # Generate the pairix index.
    subprocess.check_call(['pairix', pairpath])

    return pairpath
예제 #3
0
def stats_samfrag(samfrag_pairs, sample_size=100000):
    """Count same-fragment read pairs by strand pattern and sample their sizes.

    Every record of the pairs file is classified by the strands of its two
    sides. The input file is deleted once it has been read successfully.

    Parameters
    ----------
    samfrag_pairs : str
        Path of the pairs file to read (opened with pairtools' ``auto_open``).
        The file is removed after it has been read.
    sample_size : int
        Maximum number of size values returned.

    Returns
    -------
    stats : defaultdict(int)
        ``'120_SameFragmentReads'`` is the total number of records,
        ``'124_DanglingReads'`` counts '+'/'-' pairs,
        ``'122_SelfLigationReads'`` counts '-'/'+' pairs and
        ``'126_UnknownMechanism'`` counts every other strand combination.
        A key is present only if at least one record incremented it.
    libsize : numpy.ndarray
        Up to ``sample_size`` randomly chosen ``pos2 - pos1`` values taken
        from the '+'/'-' pairs.
    """
    from pairtools import _fileio, _pairsam_format, _headerops

    stats = defaultdict(int)
    libsize = []

    instream = _fileio.auto_open(samfrag_pairs, mode='r')
    try:
        _, body_stream = _headerops.get_header(instream)
        for line in body_stream:
            cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
            pos1 = int(cols[_pairsam_format.COL_P1])
            strand1 = cols[_pairsam_format.COL_S1]
            pos2 = int(cols[_pairsam_format.COL_P2])
            strand2 = cols[_pairsam_format.COL_S2]
            stats['120_SameFragmentReads'] += 1
            if strand1 == '+' and strand2 == '-':  # dangling reads
                stats['124_DanglingReads'] += 1
                libsize.append(pos2 - pos1)
            elif strand1 == '-' and strand2 == '+':  # self ligation
                stats['122_SelfLigationReads'] += 1
            else:
                stats['126_UnknownMechanism'] += 1
    finally:
        # Release the handle even when a malformed record aborts parsing.
        instream.close()

    # Only reached when the whole file was parsed; a failed run keeps the
    # input on disk for inspection.
    os.remove(samfrag_pairs)

    # Shuffle before truncating so the returned values are a random sample
    # rather than the first ``sample_size`` records of the file.
    libsize = np.r_[libsize]
    np.random.shuffle(libsize)
    libsize = libsize[:sample_size]

    return stats, libsize
예제 #4
0
def split_pairsam(pairsam_path):
    """Produce an indexed ``.pairs.gz`` file from a ``.pairsam.gz`` file.

    The ``#columns:`` header line is inspected: when it lists both ``sam1``
    and ``sam2``, ``pairtools split`` is run to write a ``.pairs.gz`` and a
    ``.bam`` file; otherwise the input is copied to the ``.pairs.gz`` name
    with ``cp``. The resulting pairs file is then indexed with ``pairix``.

    Parameters
    ----------
    pairsam_path : str
        Path of the input file. Output names are derived by replacing
        ``'.pairsam.gz'`` in this path, so a path that does not contain that
        suffix yields an output path equal to the input path.

    Returns
    -------
    str
        Path of the ``.pairs.gz`` file.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the external commands exits with a non-zero status.
    """
    from pairtools import _fileio, _headerops

    # Check the header of the .pairsam.gz for SAM columns.
    has_sam = False
    instream = _fileio.auto_open(pairsam_path, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            if not row.startswith('#columns:'):
                continue
            columns = row.split(':', 1)[1].strip().split()
            if 'sam1' in columns and 'sam2' in columns:
                has_sam = True
    finally:
        instream.close()

    pairpath = pairsam_path.replace('.pairsam.gz', '.pairs.gz')

    # Commands are passed as argument lists without a shell so that paths
    # containing spaces or shell metacharacters are handed over verbatim.
    if has_sam:
        bampath = pairsam_path.replace('.pairsam.gz', '.bam')
        subprocess.check_call(['pairtools', 'split',
                               '--output-pairs', pairpath,
                               '--output-sam', bampath,
                               pairsam_path])
    else:
        subprocess.check_call(['cp', pairsam_path, pairpath])

    # Generate the pairix index.
    subprocess.check_call(['pairix', pairpath])

    return pairpath
예제 #5
0
def chromsizes_from_pairs(pairpath):
    """Write the chromosome sizes recorded in a pairs header to a file.

    The ``#genome_assembly:`` and ``#chromsize:`` header lines of the pairs
    file are read, and the chromosome sizes are written, tab-separated and in
    header order, to a hidden ``.<genome>.chrom.sizes`` file located in the
    same folder as ``pairpath``.

    Parameters
    ----------
    pairpath : str
        Path of the pairs file whose header is read.

    Returns
    -------
    outpath : str
        Path of the chromosome sizes file that was written.
    genomeName : str
        Value of the ``#genome_assembly:`` header line, or ``'Unknown'`` when
        the header has no such line.
    """
    from pairtools import _fileio, _headerops

    genomeName = 'Unknown'
    chromsizes = []

    instream = _fileio.auto_open(pairpath, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            # Split on the first ':' only, so values that themselves contain
            # ':' (some contig names do) are kept intact.
            if row.startswith('#genome_assembly:'):
                genomeName = row.split(':', 1)[1].strip()
            if row.startswith('#chromsize:'):
                chromsizes.append(row.split(':', 1)[1].strip().split())
    finally:
        instream.close()

    folder = os.path.split(pairpath)[0]
    # The leading dot keeps the file invisible to users.
    outpath = os.path.join(folder, '.' + genomeName + '.chrom.sizes')
    with open(outpath, 'w') as out:
        for fields in chromsizes:  # order unchanged
            out.write('\t'.join(fields) + '\n')

    return outpath, genomeName
예제 #6
0
def restrict_py(pairs_path, frags, output, **kwargs):
    """Append restriction-fragment columns for both sides of every pair.

    For each record, ``find_rfrag`` is called once per side and the three
    values it returns (fragment start, fragment end, distance to the
    restriction site) are appended as extra columns. The six new column names
    are appended to the last header line.

    Parameters
    ----------
    pairs_path : str or None
        Input pairs file; standard input is read when this is falsy.
    frags : str
        Tab-separated file with the columns ``chrom``, ``start``, ``end``
        (lines starting with ``#`` are ignored) describing the restriction
        fragments. Rows of one chromosome are expected to be contiguous,
        since chromosome boundaries are detected where consecutive rows differ.
    output : str or None
        Output file; standard output is written when this is falsy.
    **kwargs
        ``nproc_in``, ``nproc_out``, ``cmd_in`` and ``cmd_out`` are forwarded
        to ``_fileio.auto_open``.
    """
    instream = (_fileio.auto_open(pairs_path,
                                  mode='r',
                                  nproc=kwargs.get('nproc_in'),
                                  command=kwargs.get('cmd_in', None))
                if pairs_path else sys.stdin)

    outstream = (_fileio.auto_open(output,
                                   mode='w',
                                   nproc=kwargs.get('nproc_out'),
                                   command=kwargs.get('cmd_out', None))
                 if output else sys.stdout)

    try:
        header, body_stream = _headerops.get_header(instream)
        header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
        if len(header) > 0:
            # NOTE(review): assumes the last header line is the '#columns:'
            # line - confirm against _headerops.append_new_pg.
            header[-1] = header[-1] + (
                ' frag1_start frag1_end dist1_rsite'
                ' frag2_start frag2_end dist2_rsite')
        outstream.writelines((l + '\n' for l in header))

        rfrags = pd.read_csv(frags,
                             delimiter="\t",
                             dtype=None,
                             comment="#",
                             names=['chrom', 'start', 'end'],
                             encoding='utf-8')
        rfrags = rfrags.to_records()

        # Indices where the chromosome name changes, bracketed by 0 and the
        # number of rows, delimit one slice of rows per chromosome.
        chrom_borders = np.r_[0, 1 + np.where(
            rfrags['chrom'][:-1] != rfrags['chrom'][1:])[0], rfrags.shape[0]]
        # Per chromosome: an array starting with 1, followed by each
        # fragment's end + 1.
        rfrags = {
            rfrags['chrom'][i]: np.insert(rfrags['end'][i:j] + 1, 0, 1)
            for i, j in zip(chrom_borders[:-1], chrom_borders[1:])
        }

        sep = _pairsam_format.PAIRSAM_SEP
        for line in body_stream:
            cols = line.rstrip().split(sep)

            # Columns 10 and 11 are passed to find_rfrag as the CIGAR strings
            # of side 1 and side 2 - presumably fixed by the upstream column
            # layout; verify against the producer of the input file.
            chrom1 = cols[_pairsam_format.COL_C1]
            pos1 = int(cols[_pairsam_format.COL_P1])
            strand1 = cols[_pairsam_format.COL_S1]
            cigar1 = cols[10]
            rfrag_start1, rfrag_end1, dist1_rsite = find_rfrag(
                rfrags, chrom1, pos1, strand1, cigar1)
            cols += [str(rfrag_start1), str(rfrag_end1), str(dist1_rsite)]

            # Side 1 only appended columns, so the indices used for side 2
            # still refer to the original columns.
            chrom2 = cols[_pairsam_format.COL_C2]
            pos2 = int(cols[_pairsam_format.COL_P2])
            strand2 = cols[_pairsam_format.COL_S2]
            cigar2 = cols[11]
            rfrag_start2, rfrag_end2, dist2_rsite = find_rfrag(
                rfrags, chrom2, pos2, strand2, cigar2)
            cols += [str(rfrag_start2), str(rfrag_end2), str(dist2_rsite)]

            outstream.write(sep.join(cols) + '\n')
    finally:
        # Close what this function opened, even when processing failed;
        # the process-wide standard streams are left open.
        try:
            if instream is not sys.stdin:
                instream.close()
        finally:
            if outstream is not sys.stdout:
                outstream.close()
예제 #7
0
def chromsizes_from_pairs(pairpath):
    """Write the chromosome sizes recorded in a pairs header to a file.

    The ``#genome_assembly:`` and ``#chromsize:`` header lines of the pairs
    file are read, and the chromosome sizes are written, tab-separated and in
    header order, to a hidden ``.<genome>.chrom.sizes`` file located in the
    same folder as ``pairpath``.

    Parameters
    ----------
    pairpath : str
        Path of the pairs file whose header is read.

    Returns
    -------
    outpath : str
        Path of the chromosome sizes file that was written.
    genomeName : str
        Value of the ``#genome_assembly:`` header line, or ``'Unknown'`` when
        the header has no such line.
    """
    from pairtools import _fileio, _headerops

    genomeName = 'Unknown'
    chromsizes = []

    instream = _fileio.auto_open(pairpath, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            # Split on the first ':' only, so values that themselves contain
            # ':' (some contig names do) are kept intact.
            if row.startswith('#genome_assembly:'):
                genomeName = row.split(':', 1)[1].strip()
            if row.startswith('#chromsize:'):
                chromsizes.append(row.split(':', 1)[1].strip().split())
    finally:
        instream.close()

    folder = os.path.split(pairpath)[0]
    # The leading dot keeps the file invisible to users.
    outpath = os.path.join(folder, '.'+genomeName+'.chrom.sizes')
    with open(outpath, 'w') as out:
        for fields in chromsizes: # order unchanged
            out.write('\t'.join(fields)+'\n')

    return outpath, genomeName
예제 #8
0
def _cigar_match_length(cols, cigar_index):
    """Return the reference length covered by the CIGAR in ``cols[cigar_index]``.

    ``cigar_index`` is ``None`` when CIGAR use is disabled; the length is then
    1. A ``'*'`` CIGAR is treated as ``'1M'``.
    """
    if cigar_index is None:
        return 1
    cigar = cols[cigar_index]
    if cigar == '*':
        cigar = '1M'
    return sum(op.ref_iv.length for op in HTSeq.parse_cigar(cigar))


def _annotate_side(ant, cols, chrom_col, pos_col, strand_col, cigar_index,
                   min_over, strand_type):
    """Run ``annotate_region`` for one side of a pair and return its string."""
    return annotate_region(ant, cols[chrom_col], int(cols[pos_col]),
                           cols[strand_col],
                           _cigar_match_length(cols, cigar_index),
                           min_over, strand_type)


def annotate_pairs(pairs_path, ant, ant_mode, ant_col, strand_type, min_over,
                   cigar_col, output, **kwargs):
    """Append annotation columns to every record of a .pairs file.

    Parameters
    ----------
    pairs_path : str or None
        Input pairs file; standard input is read when this is falsy.
    ant
        Annotation object handed unchanged to ``annotate_region``.
    ant_mode : str
        Case-insensitive. ``'rna'`` annotates side 1 only and ``'dna'``
        annotates side 2 only; any other value annotates both sides. The
        header receives every name in ``ant_col`` only when the mode is
        ``'both'``, otherwise just the first one.
    ant_col : str
        Comma-separated names of the new annotation column(s). They must not
        already exist in the file.
    strand_type : sequence of str
        One entry per annotated side, each of ``'s'``, ``'r'`` or ``'n'``;
        passed to ``annotate_region``.
    min_over : str
        Comma-separated integers, one per annotated side; passed to
        ``annotate_region``.
    cigar_col : str
        Comma-separated names, one per annotated side, of the columns holding
        CIGAR strings. An entry of ``'false'`` (any case) disables CIGAR use
        for that side, in which case a match length of 1 is used.
    output : str or None
        Output file; standard output is written when this is falsy.
    **kwargs
        ``nproc_in``, ``nproc_out``, ``cmd_in`` and ``cmd_out`` are forwarded
        to ``_fileio.auto_open``.

    Raises
    ------
    SystemExit
        With status 1, after a message on standard error, when the header is
        missing or does not end with a ``#columns:`` line, when an annotation
        column already exists, when a strand type is invalid, or when a CIGAR
        column is unknown.
    """
    instream = (_fileio.auto_open(pairs_path,
                                  mode='r',
                                  nproc=kwargs.get('nproc_in'),
                                  command=kwargs.get('cmd_in', None))
                if pairs_path else sys.stdin)

    outstream = (_fileio.auto_open(output,
                                   mode='w',
                                   nproc=kwargs.get('nproc_out'),
                                   command=kwargs.get('cmd_out', None))
                 if output else sys.stdout)

    try:
        header, body_stream = _headerops.get_header(instream)
        header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)

        if len(header) == 0:
            sys.stderr.write('.pairs file doesn\'t have header rows!\n')
            raise SystemExit(1)

        col_names = header[-1].split(' ')
        if col_names[0] != '#columns:':
            sys.stderr.write(
                'The last row of .pairs header is not a valid col_names row (start with \'#columns:\')!\n'
            )
            raise SystemExit(1)
        # Drop the '#columns:' tag so list positions equal record column
        # indices.
        col_names.pop(0)

        ant_col = ant_col.split(',')
        for name in ant_col:
            if name in col_names:
                sys.stderr.write(
                    'Annotation col names already exist in .pairs file!\n')
                raise SystemExit(1)

        for kind in strand_type:
            if kind not in ['s', 'r', 'n']:
                sys.stderr.write(
                    'Invalid strand specific type for annotation!\n')
                raise SystemExit(1)

        mode = ant_mode.lower()
        if mode == 'both':
            header[-1] = header[-1] + ' ' + ' '.join(ant_col)
        else:
            header[-1] = header[-1] + ' ' + ant_col[0]

        min_over = [int(i) for i in min_over.split(',')]

        # Resolve each CIGAR column name to its column index; None marks a
        # side for which CIGAR use was switched off with 'false'.
        cigar_idx = []
        for name in cigar_col.split(','):
            if name in col_names:
                cigar_idx.append(col_names.index(name))
            elif name.lower() == 'false':
                cigar_idx.append(None)
            else:
                sys.stderr.write(
                    'Cigar col names doesn\'t exist in .pairs file!\n')
                raise SystemExit(1)

        outstream.writelines(l + '\n' for l in header)

        sep = _pairsam_format.PAIRSAM_SEP
        side1 = (_pairsam_format.COL_C1, _pairsam_format.COL_P1,
                 _pairsam_format.COL_S1)
        side2 = (_pairsam_format.COL_C2, _pairsam_format.COL_P2,
                 _pairsam_format.COL_S2)

        for count_line, line in enumerate(body_stream, 1):
            if count_line % 1000000 == 0:
                # Progress goes to stderr so it can never end up inside the
                # records when the output stream is stdout.
                sys.stderr.write(
                    "%d records processed ...\n" % count_line)

            record = line.rstrip()
            cols = record.split(sep)

            if mode == 'rna':
                ant_str = _annotate_side(ant, cols, side1[0], side1[1],
                                         side1[2], cigar_idx[0],
                                         min_over[0], strand_type[0])
            elif mode == 'dna':
                # Single-sided modes always use the first entry of the
                # per-side option lists.
                ant_str = _annotate_side(ant, cols, side2[0], side2[1],
                                         side2[2], cigar_idx[0],
                                         min_over[0], strand_type[0])
            else:
                ant_str = _annotate_side(ant, cols, side1[0], side1[1],
                                         side1[2], cigar_idx[0],
                                         min_over[0], strand_type[0])
                ant_str += sep + _annotate_side(ant, cols, side2[0],
                                                side2[1], side2[2],
                                                cigar_idx[1], min_over[1],
                                                strand_type[1])

            outstream.write(sep.join([record, ant_str]) + '\n')
    finally:
        # Close what this function opened, even on validation or processing
        # failure; the process-wide standard streams are left open.
        try:
            if instream is not sys.stdin:
                instream.close()
        finally:
            if outstream is not sys.stdout:
                outstream.close()