def restrict_py(pairs_path, frags, output, **kwargs):
    """Append restriction-fragment columns for both sides of every pair.

    For each record of the input pairs stream, ``find_rfrag`` is called once
    per side and its three return values (fragment start, fragment end and
    distance to the restriction site) are appended as text columns. The last
    header line is extended with the six matching column names.

    Parameters
    ----------
    pairs_path : str or None
        Input pairs file. A falsy value means "read from ``sys.stdin``".
    frags : str
        Tab-separated file read with ``pandas.read_csv`` using the column
        names ``chrom``, ``start``, ``end``; lines starting with ``#`` are
        treated as comments.
    output : str or None
        Output file. A falsy value means "write to ``sys.stdout``".
    **kwargs
        ``nproc_in`` / ``cmd_in`` and ``nproc_out`` / ``cmd_out`` are
        forwarded to ``_fileio.auto_open`` for the input and output streams.
    """
    instream = (_fileio.auto_open(pairs_path, mode='r',
                                  nproc=kwargs.get('nproc_in'),
                                  command=kwargs.get('cmd_in', None))
                if pairs_path else sys.stdin)
    outstream = (_fileio.auto_open(output, mode='w',
                                   nproc=kwargs.get('nproc_out'),
                                   command=kwargs.get('cmd_out', None))
                 if output else sys.stdout)

    # try/finally so that both streams are released even when the fragment
    # file cannot be parsed or a record is malformed.
    try:
        header, body_stream = _headerops.get_header(instream)
        header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
        if len(header) > 0:
            # The new column names are appended to the last header line.
            header[-1] = header[-1] + ' frag1_start frag1_end dist1_rsite frag2_start frag2_end dist2_rsite'
        outstream.writelines(row + '\n' for row in header)

        frag_table = pd.read_csv(frags, delimiter="\t", dtype=None,
                                 comment="#",
                                 names=['chrom', 'start', 'end'],
                                 encoding='utf-8')
        frag_table = frag_table.to_records()

        # Indices at which the chromosome name changes, bracketed by 0 and the
        # number of rows, so consecutive pairs delimit one chromosome each.
        chrom_borders = np.r_[
            0,
            1 + np.where(
                frag_table['chrom'][:-1] != frag_table['chrom'][1:])[0],
            frag_table.shape[0]]

        # chrom -> array holding 1 followed by (end + 1) of every fragment.
        rfrags = {
            frag_table['chrom'][lo]: np.insert(
                frag_table['end'][lo:hi] + 1, 0, 1)
            for lo, hi in zip(chrom_borders[:-1], chrom_borders[1:])
        }

        sep = _pairsam_format.PAIRSAM_SEP
        for line in body_stream:
            cols = line.rstrip().split(sep)

            # NOTE(review): columns 10 and 11 are read as the CIGAR strings of
            # side 1 and side 2; the indices are hard-coded, confirm they
            # match the layout of the incoming file.
            chrom1 = cols[_pairsam_format.COL_C1]
            pos1 = int(cols[_pairsam_format.COL_P1])
            strand1 = cols[_pairsam_format.COL_S1]
            cigar1 = cols[10]
            rfrag_start1, rfrag_end1, dist1_rsite = find_rfrag(
                rfrags, chrom1, pos1, strand1, cigar1)
            cols += [str(rfrag_start1), str(rfrag_end1), str(dist1_rsite)]

            chrom2 = cols[_pairsam_format.COL_C2]
            pos2 = int(cols[_pairsam_format.COL_P2])
            strand2 = cols[_pairsam_format.COL_S2]
            cigar2 = cols[11]
            rfrag_start2, rfrag_end2, dist2_rsite = find_rfrag(
                rfrags, chrom2, pos2, strand2, cigar2)
            cols += [str(rfrag_start2), str(rfrag_end2), str(dist2_rsite)]

            outstream.write(sep.join(cols))
            outstream.write('\n')
    finally:
        # Never close the process-wide standard streams.
        if instream != sys.stdin:
            instream.close()
        if outstream != sys.stdout:
            outstream.close()
def stats_samfrag(samfrag_pairs, sample_size=100000):
    """Classify the pairs in *samfrag_pairs* by strand and sample sizes.

    Every record increments ``120_SameFragmentReads`` and exactly one of
    ``124_DanglingReads`` (+/-), ``122_SelfLigationReads`` (-/+) or
    ``126_UnknownMechanism`` (any other strand combination). For +/- records
    ``pos2 - pos1`` is collected.

    The input file is deleted once it has been read completely.

    Parameters
    ----------
    samfrag_pairs : str
        Path of the pairs file to read (and then remove).
    sample_size : int, optional
        Maximum number of collected ``pos2 - pos1`` values to return.

    Returns
    -------
    stats : collections.defaultdict
        Counter name -> count.
    libsize : numpy.ndarray
        The collected ``pos2 - pos1`` values, shuffled and truncated to at
        most *sample_size* entries.
    """
    from pairtools import _fileio, _pairsam_format, _headerops

    stats = defaultdict(int)
    libsize = []

    instream = _fileio.auto_open(samfrag_pairs, mode='r')
    # try/finally: a malformed record must not leave the stream open.
    try:
        _, body_stream = _headerops.get_header(instream)
        for line in body_stream:
            cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
            pos1 = int(cols[_pairsam_format.COL_P1])
            strand1 = cols[_pairsam_format.COL_S1]
            pos2 = int(cols[_pairsam_format.COL_P2])
            strand2 = cols[_pairsam_format.COL_S2]

            stats['120_SameFragmentReads'] += 1
            if strand1 == '+' and strand2 == '-':
                # dangling reads
                stats['124_DanglingReads'] += 1
                libsize.append(pos2 - pos1)
            elif strand1 == '-' and strand2 == '+':
                # self ligation
                stats['122_SelfLigationReads'] += 1
            else:
                stats['126_UnknownMechanism'] += 1
    finally:
        instream.close()

    # Reached only after a successful pass, so a failed run keeps its input.
    os.remove(samfrag_pairs)

    # Shuffle first so the truncation below yields a random subset.
    libsize = np.r_[libsize]
    np.random.shuffle(libsize)
    libsize = libsize[:sample_size]

    return stats, libsize
def split_pairsam(pairsam_path):
    """Produce an indexed ``.pairs.gz`` file from a ``.pairsam.gz`` file.

    The ``#columns:`` header line is inspected: when it lists both ``sam1``
    and ``sam2``, ``pairtools split`` is run to write a ``.pairs.gz`` and a
    ``.bam`` file; otherwise the input is copied to the ``.pairs.gz`` path
    with ``cp``. In both cases ``pairix`` is then run on the result.

    Parameters
    ----------
    pairsam_path : str
        Path of the input file. Output paths are derived by replacing
        ``.pairsam.gz`` in this string.

    Returns
    -------
    str
        Path of the ``.pairs.gz`` file.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the external commands exits with a non-zero status.
    """
    from pairtools import _fileio, _headerops

    # check for SAM information with the header of .pairsam.gz
    has_sam = False
    instream = _fileio.auto_open(pairsam_path, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            if not row.startswith('#columns:'):
                continue
            # maxsplit=1 keeps everything after the "#columns:" prefix.
            columns = row.split(':', 1)[1].split()
            if 'sam1' in columns and 'sam2' in columns:
                has_sam = True
    finally:
        instream.close()

    pairpath = pairsam_path.replace('.pairsam.gz', '.pairs.gz')

    # Commands are passed as argument lists (no shell) so that paths with
    # spaces or shell metacharacters are handed over verbatim.
    if has_sam:
        bampath = pairsam_path.replace('.pairsam.gz', '.bam')
        subprocess.check_call(['pairtools', 'split',
                               '--output-pairs', pairpath,
                               '--output-sam', bampath,
                               pairsam_path])
    else:
        subprocess.check_call(['cp', pairsam_path, pairpath])

    # generate pairix index
    subprocess.check_call(['pairix', pairpath])

    return pairpath
def stats_samfrag(samfrag_pairs, sample_size=100000):
    """Classify the pairs in *samfrag_pairs* by strand and sample sizes.

    Every record increments ``120_SameFragmentReads`` and exactly one of
    ``124_DanglingReads`` (+/-), ``122_SelfLigationReads`` (-/+) or
    ``126_UnknownMechanism`` (any other strand combination). For +/- records
    ``pos2 - pos1`` is collected.

    The input file is deleted once it has been read completely.

    Parameters
    ----------
    samfrag_pairs : str
        Path of the pairs file to read (and then remove).
    sample_size : int, optional
        Maximum number of collected ``pos2 - pos1`` values to return.

    Returns
    -------
    stats : collections.defaultdict
        Counter name -> count.
    libsize : numpy.ndarray
        The collected ``pos2 - pos1`` values, shuffled and truncated to at
        most *sample_size* entries.
    """
    from pairtools import _fileio, _pairsam_format, _headerops

    stats = defaultdict(int)
    libsize = []

    instream = _fileio.auto_open(samfrag_pairs, mode='r')
    # try/finally: a malformed record must not leave the stream open.
    try:
        _, body_stream = _headerops.get_header(instream)
        for line in body_stream:
            cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
            pos1 = int(cols[_pairsam_format.COL_P1])
            strand1 = cols[_pairsam_format.COL_S1]
            pos2 = int(cols[_pairsam_format.COL_P2])
            strand2 = cols[_pairsam_format.COL_S2]

            stats['120_SameFragmentReads'] += 1
            if strand1 == '+' and strand2 == '-':
                # dangling reads
                stats['124_DanglingReads'] += 1
                libsize.append(pos2 - pos1)
            elif strand1 == '-' and strand2 == '+':
                # self ligation
                stats['122_SelfLigationReads'] += 1
            else:
                stats['126_UnknownMechanism'] += 1
    finally:
        instream.close()

    # Reached only after a successful pass, so a failed run keeps its input.
    os.remove(samfrag_pairs)

    # Shuffle first so the truncation below yields a random subset.
    libsize = np.r_[libsize]
    np.random.shuffle(libsize)
    libsize = libsize[:sample_size]

    return stats, libsize
def split_pairsam(pairsam_path):
    """Produce an indexed ``.pairs.gz`` file from a ``.pairsam.gz`` file.

    The ``#columns:`` header line is inspected: when it lists both ``sam1``
    and ``sam2``, ``pairtools split`` is run to write a ``.pairs.gz`` and a
    ``.bam`` file; otherwise the input is copied to the ``.pairs.gz`` path
    with ``cp``. In both cases ``pairix`` is then run on the result.

    Parameters
    ----------
    pairsam_path : str
        Path of the input file. Output paths are derived by replacing
        ``.pairsam.gz`` in this string.

    Returns
    -------
    str
        Path of the ``.pairs.gz`` file.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the external commands exits with a non-zero status.
    """
    from pairtools import _fileio, _headerops

    # check for SAM information with the header of .pairsam.gz
    has_sam = False
    instream = _fileio.auto_open(pairsam_path, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            if not row.startswith('#columns:'):
                continue
            # maxsplit=1 keeps everything after the "#columns:" prefix.
            columns = row.split(':', 1)[1].split()
            if 'sam1' in columns and 'sam2' in columns:
                has_sam = True
    finally:
        instream.close()

    pairpath = pairsam_path.replace('.pairsam.gz', '.pairs.gz')

    # Commands are passed as argument lists (no shell) so that paths with
    # spaces or shell metacharacters are handed over verbatim.
    if has_sam:
        bampath = pairsam_path.replace('.pairsam.gz', '.bam')
        subprocess.check_call(['pairtools', 'split',
                               '--output-pairs', pairpath,
                               '--output-sam', bampath,
                               pairsam_path])
    else:
        subprocess.check_call(['cp', pairsam_path, pairpath])

    # generate pairix index
    subprocess.check_call(['pairix', pairpath])

    return pairpath
def chromsizes_from_pairs(pairpath):
    """Write the chromosome sizes found in a pairs header to a hidden file.

    ``#chromsize:`` header lines are collected in file order and written,
    tab-separated, to ``.<genome>.chrom.sizes`` in the directory of
    *pairpath*, where ``<genome>`` is the value of the ``#genome_assembly:``
    header line or ``Unknown`` when that line is absent.

    Parameters
    ----------
    pairpath : str
        Path of the pairs file whose header is read.

    Returns
    -------
    outpath : str
        Path of the chromosome sizes file that was written.
    genomeName : str
        Genome assembly name taken from the header (``'Unknown'`` if none).
    """
    from pairtools import _fileio, _headerops

    genomeName = 'Unknown'
    chromsizes = []

    instream = _fileio.auto_open(pairpath, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            # maxsplit=1: only the first colon ends the header key, so values
            # that contain ':' themselves (e.g. contig names such as
            # "HLA-A*01:01:01:01") are kept whole.
            if row.startswith('#genome_assembly:'):
                genomeName = row.split(':', 1)[1].strip()
            if row.startswith('#chromsize:'):
                chromsizes.append(row.split(':', 1)[1].split())
    finally:
        instream.close()

    folder = os.path.split(pairpath)[0]
    # Leading dot: invisible to users.
    outpath = os.path.join(folder, '.' + genomeName + '.chrom.sizes')
    with open(outpath, 'w') as out:
        for fields in chromsizes:  # order unchanged
            out.write('\t'.join(fields) + '\n')

    return outpath, genomeName
def chromsizes_from_pairs(pairpath):
    """Write the chromosome sizes found in a pairs header to a hidden file.

    ``#chromsize:`` header lines are collected in file order and written,
    tab-separated, to ``.<genome>.chrom.sizes`` in the directory of
    *pairpath*, where ``<genome>`` is the value of the ``#genome_assembly:``
    header line or ``Unknown`` when that line is absent.

    Parameters
    ----------
    pairpath : str
        Path of the pairs file whose header is read.

    Returns
    -------
    outpath : str
        Path of the chromosome sizes file that was written.
    genomeName : str
        Genome assembly name taken from the header (``'Unknown'`` if none).
    """
    from pairtools import _fileio, _headerops

    genomeName = 'Unknown'
    chromsizes = []

    instream = _fileio.auto_open(pairpath, mode='r')
    try:
        header, _ = _headerops.get_header(instream)
        for row in header:
            # maxsplit=1: only the first colon ends the header key, so values
            # that contain ':' themselves (e.g. contig names such as
            # "HLA-A*01:01:01:01") are kept whole.
            if row.startswith('#genome_assembly:'):
                genomeName = row.split(':', 1)[1].strip()
            if row.startswith('#chromsize:'):
                chromsizes.append(row.split(':', 1)[1].split())
    finally:
        instream.close()

    folder = os.path.split(pairpath)[0]
    # Leading dot: invisible to users.
    outpath = os.path.join(folder, '.' + genomeName + '.chrom.sizes')
    with open(outpath, 'w') as out:
        for fields in chromsizes:  # order unchanged
            out.write('\t'.join(fields) + '\n')

    return outpath, genomeName
def _annotate_pairs_abort(message):
    """Write *message* to stderr and terminate with exit status 1."""
    sys.stderr.write(message)
    raise SystemExit(1)


def _annotate_pairs_match_length(cols, cigar_index):
    """Return the reference length covered by one side's alignment.

    ``cigar_index`` is the column holding the CIGAR string, or ``None`` when
    CIGAR use was disabled, in which case the length is 1. A ``*`` CIGAR is
    treated as ``1M``.
    """
    if cigar_index is None:
        return 1
    cigar = cols[cigar_index]
    if cigar == '*':
        cigar = '1M'
    return sum(op.ref_iv.length for op in HTSeq.parse_cigar(cigar))


def _annotate_pairs_side(ant, cols, side, cigar_index, min_overlap,
                         strand_mode):
    """Run ``annotate_region`` for side 1 or side 2 of one split record."""
    if side == 1:
        chrom_col = _pairsam_format.COL_C1
        pos_col = _pairsam_format.COL_P1
        strand_col = _pairsam_format.COL_S1
    else:
        chrom_col = _pairsam_format.COL_C2
        pos_col = _pairsam_format.COL_P2
        strand_col = _pairsam_format.COL_S2
    return annotate_region(
        ant, cols[chrom_col], int(cols[pos_col]), cols[strand_col],
        _annotate_pairs_match_length(cols, cigar_index),
        min_overlap, strand_mode)


def annotate_pairs(pairs_path, ant, ant_mode, ant_col, strand_type, min_over,
                   cigar_col, output, **kwargs):
    """Append annotation column(s) produced by ``annotate_region`` to pairs.

    Parameters
    ----------
    pairs_path : str or None
        Input pairs file; a falsy value means ``sys.stdin``.
    ant
        Annotation object handed unchanged to ``annotate_region``.
    ant_mode : str
        Case-insensitive. ``'rna'`` annotates side 1 only, ``'dna'``
        annotates side 2 only; any other value annotates both sides. Only
        the value ``'both'`` adds every name of *ant_col* to the header;
        other values add the first name only.
    ant_col : str
        Comma-separated names of the new column(s); none may already exist
        in the ``#columns:`` header line.
    strand_type : sequence of str
        Each element must be ``'s'``, ``'r'`` or ``'n'``. Element 0 is used
        for the single annotated side (or side 1 when both are annotated),
        element 1 for side 2 when both are annotated.
    min_over : str
        Comma-separated integers, indexed like *strand_type*.
    cigar_col : str
        Comma-separated names of the CIGAR columns, indexed like
        *strand_type*. A name that is not a column and equals ``false``
        (case-insensitive) disables CIGAR use for that side; the alignment
        is then given length 1.
    output : str or None
        Output file; a falsy value means ``sys.stdout``.
    **kwargs
        ``nproc_in`` / ``cmd_in`` and ``nproc_out`` / ``cmd_out`` are
        forwarded to ``_fileio.auto_open``.

    Raises
    ------
    SystemExit
        With status 1 after a message on stderr when the header is missing
        or malformed, a new column name already exists, a strand type is
        invalid, or a CIGAR column name is unknown.
    """
    instream = (_fileio.auto_open(pairs_path, mode='r',
                                  nproc=kwargs.get('nproc_in'),
                                  command=kwargs.get('cmd_in', None))
                if pairs_path else sys.stdin)
    outstream = (_fileio.auto_open(output, mode='w',
                                   nproc=kwargs.get('nproc_out'),
                                   command=kwargs.get('cmd_out', None))
                 if output else sys.stdout)

    # try/finally so the streams are released on every exit path, including
    # the SystemExit raised by the validation below.
    try:
        header, body_stream = _headerops.get_header(instream)
        header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
        if len(header) == 0:
            _annotate_pairs_abort('.pairs file doesn\'t have header rows!\n')

        col_names = header[-1].split(' ')
        if col_names[0] != '#columns:':
            _annotate_pairs_abort(
                'The last row of .pairs header is not a valid col_names row (start with \'#columns:\')!\n')
        # After dropping the "#columns:" token, list positions coincide with
        # field positions of a split record.
        col_names.pop(0)

        ant_col = ant_col.split(',')
        if any(name in col_names for name in ant_col):
            _annotate_pairs_abort(
                'Annotation col names already exist in .pairs file!\n')

        if any(kind not in ['s', 'r', 'n'] for kind in strand_type):
            _annotate_pairs_abort(
                'Invalid strand specific type for annotation!\n')

        mode = ant_mode.lower()
        if mode == 'both':
            header[-1] = header[-1] + ' ' + ' '.join(ant_col)
        else:
            header[-1] = header[-1] + ' ' + ant_col[0]

        min_over = [int(value) for value in min_over.split(',')]

        # Resolve CIGAR column names to field indices. An existing column
        # always wins; otherwise "false" disables CIGAR use (stored as None)
        # instead of being looked up as a column name, which used to raise
        # ValueError.
        cigar_idx = []
        for name in cigar_col.split(','):
            if name in col_names:
                cigar_idx.append(col_names.index(name))
            elif name.lower() == 'false':
                cigar_idx.append(None)
            else:
                _annotate_pairs_abort(
                    'Cigar col names doesn\'t exist in .pairs file!\n')

        outstream.writelines(row + '\n' for row in header)

        sep = _pairsam_format.PAIRSAM_SEP
        for count_line, line in enumerate(body_stream, 1):
            if count_line % 1000000 == 0:
                # Progress goes to stderr: stdout may be the data stream.
                sys.stderr.write(
                    "%d records processed ...\n" % count_line)

            record = line.rstrip()
            cols = record.split(sep)

            if mode == 'rna':
                ant_str = _annotate_pairs_side(
                    ant, cols, 1, cigar_idx[0], min_over[0], strand_type[0])
            elif mode == 'dna':
                ant_str = _annotate_pairs_side(
                    ant, cols, 2, cigar_idx[0], min_over[0], strand_type[0])
            else:
                ant_str = _annotate_pairs_side(
                    ant, cols, 1, cigar_idx[0], min_over[0], strand_type[0])
                ant_str += sep + _annotate_pairs_side(
                    ant, cols, 2, cigar_idx[1], min_over[1], strand_type[1])

            outstream.write(sep.join([record, ant_str]))
            outstream.write('\n')
    finally:
        # Never close the process-wide standard streams.
        if instream != sys.stdin:
            instream.close()
        if outstream != sys.stdout:
            outstream.close()