def parse_fasta(f_names, chr_names=None, verbose=True):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass a list of chromosome names, or just one. If
       None is passed, chromosome names will be inferred from fasta headers
    :returns: a sorted dictionary with chromosome names as keys, and sequences
       as values (sequence in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]
    if isinstance(chr_names, str):
        chr_names = [chr_names]

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                if not chr_names:
                    header = line[1:].split()[0]
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        genome_seq[header] = ''.join(seq).upper()
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            genome_seq[header] = ''.join([l.rstrip() for l in fhandler]).upper()
    return genome_seq
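
# Illustrative usage sketch (not part of the original module; file names are
# hypothetical): magic_open transparently handles plain or gzipped fasta, so
# compressed references work too.
def _example_parse_fasta():
    genome = parse_fasta(['chr1.fa.gz', 'chr2.fa.gz'],
                         chr_names=['chr1', 'chr2'])
    for crm in genome:
        print crm, len(genome[crm])
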
def _gem_filter(fnam, unmap_out, map_out):
    """
    Divides reads in a map file in two categories: uniquely mapped, and not.
    Writes them in two files

    Notes:
       - GEM unique-maps can not be used as it gets rid of reads like 1:0:0:5
       - not feasible with gt.filter
    """
    fhandler = magic_open(fnam) if isinstance(fnam, str) else fnam
    unmap_out = open(unmap_out, 'w')
    map_out = open(map_out, 'w')
    for line in fhandler:
        matches = line.rsplit('\t', 2)[1]
        bad = False
        if matches != '1':
            for m in matches.replace('+', ':').split(':'):
                if m == '0':
                    continue
                if m != '1':  # first non-zero stratum has several matches
                    bad = True
                    unmap_out.write(line)
                    break
                break  # first non-zero stratum is '1': uniquely mapped
            else:  # all strata are '0': read is unmapped
                bad = True
                unmap_out.write(line)
        if not bad:
            map_out.write(line)
    unmap_out.close()
    map_out.close()  # was missing; both output handles should be closed
def get_mapped_chunk(map_folder, nreads):
    """
    Iterate over the map files in a folder, yielding dictionaries of at most
    `nreads` (read ID, position) -> (sequence, quality) entries.
    """
    seqs = {}
    printime('  - loading chunk')
    pos_file = 0
    for fname in os.listdir(map_folder):
        printime('    - ' + fname)
        fhandler = magic_open(os.path.join(map_folder, fname))
        for line in fhandler:
            pos_file += 1
            rid, seq, qal, _, pos = line.split()
            pos = int(pos.split(':')[2])
            rid = rid.split('~')[0]
            seqs[rid, pos] = (seq, qal)
            if pos_file >= nreads:
                yield seqs
                printime('  - loading chunk')
                seqs = {}
                pos_file = 0
    yield seqs
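
# Illustrative usage sketch (hypothetical folder name): stream mapped reads in
# chunks of one million entries to keep memory bounded.
def _example_get_mapped_chunk():
    total = 0
    for chunk in get_mapped_chunk('03_mapped_reads/', nreads=1000000):
        total += len(chunk)  # replace with real per-chunk processing
    print 'loaded %d mapped reads' % total
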
def _gem_filter(fnam, unmap_out, map_out):
    """
    Divides reads in a map file in two categories: uniquely mapped, and not.
    Writes them in two files

    Notes:
       - GEM unique-maps can not be used as it gets rid of reads like 1:0:0:5
       - not feasible with gt.filter
    """
    fhandler = magic_open(fnam) if isinstance(fnam, basestring) else fnam
    unmap_out = open(unmap_out, 'w')
    map_out = open(map_out, 'w')

    def _strip_read_name(line):
        """
        remove original sequence from read name when read is mapped uniquely
        """
        header, line = line.split('\t', 1)
        return '\t'.join((header.rsplit(' ', 2)[0], line))

    for line in fhandler:
        matches = line.rsplit('\t', 2)[1]
        bad = False
        if matches != '1':
            for m in matches.replace('+', ':').split(':'):
                if m == '0':
                    continue
                if m != '1':
                    bad = True
                    unmap_out.write(line)
                    break
                break
            else:
                bad = True
                unmap_out.write(line)
        if not bad:
            map_out.write(_strip_read_name(line))
    unmap_out.close()
    map_out.close()
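
# Illustrative usage sketch (hypothetical file names): split a GEM map file
# into uniquely mapped and non-uniquely mapped reads.
def _example_gem_filter():
    _gem_filter('reads_iter1.map',
                unmap_out='reads_iter1_unmapped.map',
                map_out='reads_iter1_unique.map')
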
def get_intersection(fname1, fname2, out_path, verbose=False):
    """
    Merges the two files corresponding to each of the read ends. Reads found
    in both files are merged and written to an output file.

    Dealing with multiple contacts:
      - a pairwise contact is created for each possible combination of the
        multicontacts. The name of the read is extended by '# 1/3' in case
        the reported pairwise contact corresponds to the first of 3 possible
      - it may happen that different contacts are mapped on a single RE
        fragment (if each is on a different end), in which case:
         - if no other fragment from this read is mapped, both are kept
         - otherwise, they are merged into one longer fragment (as if they
           were mapped in the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format
       similar to the inputs

    :returns: final number of pairs of interacting fragments, and a
       dictionary with the number of multiple contacts (the keys of the
       dictionary being the number of fragments caught together,
       can be 3, 4, 5...)
    """
    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = reads1.next()
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = reads1.next()
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = reads2.next()
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = reads2.next()
    read2 = line2.split('\t', 1)[0]

    if header1 != header2:
        raise Exception('input files seem to be mapped over different '
                        'chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos / nchunks
    buf = dict([(i, []) for i in xrange(nchunks + 1)])

    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in xrange(nchunks / int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary and then into temporary files
    # the dictionary is emptied every 1 million entries
    if verbose:
        print ('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        (' %4d million reads' % (count_dots))
                        if count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                stdout.flush()
                count_dots += 1
            for _ in xrange(1000000):  # iterate 1 million times, write to files
                # same read id in both lines: put the more upstream one first
                # and store them
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
        write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print '\nFound %d pairs of reads mapping uniquely' % count
    # sort each tmp file according to first element (idx) and write them
    # to the output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    # and also according to strand
    if verbose:
        print 'Sorting each temporary file by genomic coordinate'
    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        out.write(''.join(['\t'.join(l[1:]) for l in sorted(
            [l.split('\t') for l in open(
                path.join(tmp_dir, 'rep_%03d' % (b / int(nchunks**0.5)),
                          'tmp_%05d.tsv' % b))],
            key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()
    if verbose:
        print '\nRemoving temporary files...'
    system('rm -rf ' + tmp_dir)
    return count, multiples
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    add_site=True, min_seq_len=15, fastq=True, verbose=True,
                    light_storage=False, **kwargs):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence by ligation sites found,
       removes the ligation site, and puts back the original RE site.
    """
    skip = kwargs.get('skip', False)

    ## define local functions to process reads and sequences
    def _get_fastq_read_heavy(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()  # lose qualities header, not needed
        qal = fhandler.next()
        # header now also contains original read and quality
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_fastq_read_light(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()  # lose qualities header, not needed
        qal = fhandler.next()
        return (rlines, seq.strip(), qal.strip())

    def _get_map_read_heavy(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _get_map_read_light(line):
        header, seq, qal, _ = line.split('\t', 3)
        return header, seq, qal

    def _inverse_find(seq, pat):
        # returns the string 'nan' when the pattern is absent: in Python 2
        # strings sort after integers, so min() in find_patterns still picks
        # the leftmost real match, and int('nan') raises ValueError when no
        # pattern matches at all
        try:
            return pat.search(seq).start()
        except AttributeError:
            return 'nan'

    def find_patterns(seq, patterns):
        pos, pat = min((_inverse_find(seq, patterns[p]), p)
                       for p in patterns)
        return int(pos), pat

    def _split_read_re(seq, qal, patterns, site, max_seq_len=None, cnt=0):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed and preceded by the RE site if a
        ligation site was found after the fragment.

        EXAMPLE:
           seq = '-------oGATCo========oGATCGATCo_____________oGATCGATCo~~~~~~~~~~~~'
           qal = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        should yield these fragments:
           '-------oGATCo========oGATC'
           'xxxxxxxxxxxxxxxxxxxxxxHHHH'

           'GATCo_____________oGATC'
           'HHHHxxxxxxxxxxxxxxxHHHH'

           'GATCo~~~~~~~~~~~~'
           'HHHHxxxxxxxxxxxxx'

        :param seq: sequence of the read fragment
        :param qal: quality of the sequence of the read fragment
        :param patterns: list of patterns of the ligated cut sites
        :param None max_seq_len: to control that all reads are below this
           length
        :param '' site: non-ligated cut site to replace ligation site
        :param 0 cnt: to count number of fragments

        :yields: seq fragments, their qualities and their count, or index
           (higher than 0 if ligation sites are found)
        """
        cnt += 1
        try:
            pos, (r_enz1, r_enz2) = find_patterns(seq, patterns)
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt
            return
        # add quality values for the space occupied by the cut-site
        xqal1 = ('H' * len(site[r_enz1]))
        xqal2 = ('H' * len(site[r_enz2]))
        if pos < min_seq_len:
            split_read(site[r_enz2] + seq[pos + len_relgs[(r_enz1, r_enz2)]:],
                       xqal2 + qal[pos + len_relgs[(r_enz1, r_enz2)]:],
                       patterns, no_site, max_seq_len, cnt=cnt)
        else:
            yield seq[:pos] + site[r_enz1], qal[:pos] + xqal1, cnt
        new_pos = pos + len_relgs[(r_enz1, r_enz2)]
        for sseq, sqal, cnt in split_read(site[r_enz2] + seq[new_pos:],
                                          xqal2 + qal[new_pos:], patterns,
                                          site, max_seq_len, cnt=cnt):
            yield sseq, sqal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        r_enzs = [r_enz]
    elif isinstance(r_enz, list):
        r_enzs = r_enz
    else:
        r_enzs = None
    if r_enzs:
        enzymes = {}
        enz_patterns = {}
        for r_enz in r_enzs:
            enzymes[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_patterns = religateds(r_enzs)
        sub_enz_patterns = {}
        len_relgs = {}
        for r_enz1, r_enz2 in enz_patterns:
            sub_enz_patterns[(r_enz1, r_enz2)] = (
                enz_patterns[(r_enz1, r_enz2)][
                    :len(enz_patterns[(r_enz1, r_enz2)]) / 2])
            len_relgs[(r_enz1, r_enz2)] = len(enz_patterns[(r_enz1, r_enz2)])
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                print '    * enzymes: %s & %s, ligation site: %s, RE site: %s & %s' % (
                    r_enz1, r_enz2, enz_patterns[(r_enz1, r_enz2)],
                    enzymes[r_enz1], enzymes[r_enz2])
        # replace pattern with regex to support IUPAC annotation
        for ezp in enz_patterns:
            enz_patterns[ezp] = re.compile(iupac2regex(enz_patterns[ezp]))
        for ezp in sub_enz_patterns:
            sub_enz_patterns[ezp] = iupac2regex(sub_enz_patterns[ezp])
        split_read = _split_read_re
    else:
        enzymes = ''
        enz_patterns = ''
        sub_enz_patterns = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y, 1)

    # function to yield reads from input file
    if light_storage:
        get_seq = _get_fastq_read_light if fastq else _get_map_read_light
        insert_mark = insert_mark_light
    else:
        get_seq = _get_fastq_read_heavy if fastq else _get_map_read_heavy
        insert_mark = insert_mark_heavy

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
    counter = 0
    if skip:
        if fastq:
            print '  ... skipping, only counting lines'
        counter = sum(1 for _ in magic_open(fastq_path,
                                            cpus=kwargs.get('nthreads')))
        counter /= 4 if fastq else 1
        print '  ' + fastq_path, counter, fastq
        return out_fastq, counter

    # open input file
    fhandler = magic_open(fastq_path, cpus=kwargs.get('nthreads'))
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    no_site = dict([(r_enz, '') for r_enz in enzymes])
    site = enzymes if add_site else no_site
    for header in fhandler:
        header, seq, qal = get_seq(header)
        counter += 1
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_patterns, site, len(seq))
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found; retry with half the ligation site in
            # case of a sequencing error (half a ligation site is an RE site,
            # or nearly, and should thus not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_patterns, no_site,
                                    len(seq))
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        for seq, qal, cnt in iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq, qal, '0', '-\n'))))
    out.close()
    return out_name, counter
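
# Illustrative usage sketch (hypothetical file names; assumes 'HindIII' is a
# key of RESTRICTION_ENZYMES): convert a FASTQ file to MAP format, trimming
# reads to positions 1-75 and splitting at HindIII ligation sites.
def _example_transform_fastq():
    out, nreads = transform_fastq('sample_read1.fastq.gz',
                                  'sample_read1_trimmed.map',
                                  trim=(1, 75), r_enz='HindIII',
                                  light_storage=True)
    print 'Written %d reads to %s' % (nreads, out)
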
def parse_bed(fnam, resolution=1):
    """
    Simple BED and BEDgraph parser that only checks fields 1, 2, 3 and 5
    (or 1, 2 and 3 if field 5 is not available).

    .. note::

       2 or 3 column files can also be passed and will be interpreted,
       respectively, as chromosome/begin and chromosome/begin/end

    :param fnam: path to BED file
    :param 1 resolution: to bin the resulting dictionary

    :returns: a dictionary with a count of the number of entries found per
       bin. In case column 5 is present its values are used to weight the
       entries, otherwise each entry weights 1.
    """
    fhandler = magic_open(fnam)
    line = fhandler.next()
    fpos = 0  # byte offset of the first informative line (was len(line),
              # which skipped the first data line on files with no header)
    while (line.startswith('#') or line.startswith('track') or
           line.startswith('browser')):
        fpos += len(line)
        line = fhandler.next()
    ##################
    # check file type
    try:
        # classic BED
        _, _, _, _, val, _ = line.split('\t', 5)
        try:
            float(val)
            parse_line = _bed_float
        except ValueError:
            parse_line = _bed_one
    except ValueError:
        try:
            # BEDgraph
            _, _, _, val = line.split('\t', 5)
            parse_line = _bedgraph_float
        except ValueError:
            try:
                # BEDgraph with no values
                _, _, _ = line.split()
                parse_line = _3_col
            except ValueError:
                # only chromosome and begin position available
                parse_line = _2_col
    ####################################
    # go back to first informative line
    # parse
    dico = {}
    fhandler.seek(fpos)
    for line in fhandler:
        crm, beg, end, val = parse_line(line)
        # note: (beg + end - beg) reduces to plain 'end'; kept as in the
        # original, but the interval midpoint was probably intended
        pos = (beg + end - beg) / resolution
        dico.setdefault(crm, {})
        dico[crm].setdefault(pos, 0)
        dico[crm][pos] += val
    return dico
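
# Illustrative usage sketch (hypothetical file name): bin a BED file at 100 kb
# resolution and inspect the counts on one chromosome.
def _example_parse_bed():
    peaks = parse_bed('H3K4me3_peaks.bed', resolution=100000)
    for pos in sorted(peaks.get('chr1', {})):
        print 'chr1', pos, peaks['chr1'][pos]
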
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, **kwargs):
    """
    Parse map files

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile in tab separated format containing
       mapped read1 information
    :param out_file2: path to outfile in tab separated format containing
       mapped read2 information
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parsers.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = ('/'.join(outfiles[read].split('/')[:-1]) +
                    '/tmp_' + outfiles[read].split('/')[-1])
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
                       stdin=PIPE, stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(pos) + len_seq - 1  # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside '
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## write the file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read_line = tmp_reads_fh.next()
        prev_head = read_line.split('\t', 1)[0]
        prev_read = read_line.strip()
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            if head == prev_head:
                prev_read += '|||' + read_line.strip()
            else:
                reads_fh.write(prev_read + '\n')
                prev_read = read_line.strip()
            prev_head = head
        reads_fh.write(prev_read + '\n')
        reads_fh.close()
def get_intersection(fname1, fname2, out_path, verbose=False, compress=False):
    """
    Merges the two files corresponding to each of the read ends. Reads found
    in both files are merged and written to an output file.

    Dealing with multiple contacts:
      - a pairwise contact is created for each possible combination of the
        multicontacts. The name of the read is extended by '# 1/3' in case
        the reported pairwise contact corresponds to the first of 3 possible
      - it may happen that different contacts are mapped on a single RE
        fragment (if each is on a different end), in which case:
         - if no other fragment from this read is mapped, both are kept
         - otherwise, they are merged into one longer fragment (as if they
           were mapped in the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format
       similar to the inputs
    :param False compress: compress (gzip) input files. This is done in the
       background while the next input files are parsed.

    :returns: final number of pairs of interacting fragments, and a
       dictionary with the number of multiple contacts (the keys of the
       dictionary being the number of fragments caught together,
       can be 3, 4, 5...)
    """
    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = next(reads1)
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = next(reads1)
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = next(reads2)
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = next(reads2)
    read2 = line2.split('\t', 1)[0]

    if header1 != header2:
        raise Exception('input files seem to be mapped over different '
                        'chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos // nchunks
    buf = dict([(i, []) for i in range(nchunks + 1)])

    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in range(nchunks // int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary and then into temporary files
    # the dictionary is emptied every 1 million entries
    if verbose:
        print('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        (' %4d million reads' % (count_dots))
                        if count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                stdout.flush()
                count_dots += 1
            for _ in range(1000000):  # iterate 1 million times, write to files
                # same read id in both lines: put the more upstream one first
                # and store them
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
        write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print('\nFound %d pairs of reads mapping uniquely' % count)

    # compression
    if compress:
        if verbose:
            print('compressing input files')
        procs = [Popen(['gzip', f]) for f in (fname1, fname2)]

    # sort each tmp file according to first element (idx) and write them
    # to the output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    # and also according to strand
    if verbose:
        print('Sorting each temporary file by genomic coordinate')
    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        with open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                            'tmp_%05d.tsv' % b)) as f_tmp:
            out.write(''.join(['\t'.join(l[1:]) for l in sorted(
                [l.split('\t') for l in f_tmp],
                key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()
    if compress:
        for proc in procs:
            proc.communicate()
        system('rm -rf ' + fname1)
        system('rm -rf ' + fname2)
    if verbose:
        print('\nRemoving temporary files...')
    system('rm -rf ' + tmp_dir)
    return count, multiples
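
# Illustrative usage sketch (hypothetical file names): intersect the two read
# ends and report multiple-contact statistics.
def _example_get_intersection():
    count, multiples = get_intersection('read1_parsed.tsv',
                                        'read2_parsed.tsv',
                                        'both_ends.tsv', verbose=True)
    print('%d read pairs; multi-contacts: %s' % (count, multiples))
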
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None,
                 savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    The proportion of dangling-ends is inferred by counting the number of
    times a dangling-end site is found at the beginning of any of the reads
    (divided by the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read; not necessary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using the matplotlib GUI (the
       extension of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least one ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, basestring):
        r_enzs = [r_enz]
    else:
        r_enzs = [None]  # fix: r_enzs was undefined when r_enz is None
    for k in list(RESTRICTION_ENZYMES.keys()):
        for i in range(len(r_enzs)):
            if k.lower() == str(r_enzs[i]).lower():  # fix: was r_enz[i]
                r_enzs[i] = k
    # otherwise leave it as None
    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = OrderedDict()
    ligep = {}
    tkw = dict(size=4, width=1.5)
    fhandler = magic_open(fnam)
    if len(r_enzs) == 1 and r_enzs[0] is None:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:  # fix: next() raises StopIteration, not EOFError
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        l_sites = OrderedDict((k, iupac2regex(l_sites[k])) for k in l_sites)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(iupac2regex(r_sites[r_enz]))
            fixe[r_enz] = re.compile(iupac2regex(d_sites[r_enz]))
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store ligation sites
            ligep[k] = 0   # initialize dico to count reads with ligation site
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        callback = lambda pat: pat.group(0).lower()
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in list(l_sites.values()):
                seq = re.sub(lig.upper(), callback, seq)
            for r_enz in r_enzs:
                sites[r_enz].extend(
                    [m.start() for m in site[r_enz].finditer(seq)])
                # TODO: a repaired/fixed site cannot appear in the middle of
                # the sequence; this could be checked at the beginning only
                fixes[r_enz].extend(
                    [m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each pair of cut-sites
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                if lige[k].search(seq):
                    ligep[k] += 1
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = list(zip(*[(nanmean(q), nanstd(q))
                                       for q in quals]))
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if len(r_enzs) == 1 and r_enzs[0] is None:  # only the quality plot
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        else:  # quality plot plus digestion/ligation plot
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')
    ax.errorbar(list(range(max_seq_len)), meanquals, linewidth=1,
                elinewidth=1, color='darkblue', yerr=errorquals,
                ecolor='orange')
    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # plot the number of "N"s per position
    axb.plot([henes.count(i) for i in range(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        with catch_warnings():
            simplefilter("ignore")
            axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if not (len(r_enzs) == 1 and r_enzs[0] is None):
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(map(str, r_enzs)), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. We don't want to plot
        # where there is no room for the cut-site or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k)
                            for k in range(seq_len)]  # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k)
                            for k in range(seq_len)]  # Dangling-ends
        for r1, r2 in liges:
            liges[(r1, r2)] = [liges[(r1, r2)].count(k)
                               for k in range(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern, sites were counted twice (once as undigested and
        # once as repaired). We remove them from the repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])
                fixes[r_enz] = (fixes[r_enz][:pos] + [
                    fixes[r_enz][k] - sites[r_enz][k - pos]
                    for k in range(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] + [
                    fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                    for k in range(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len // 2 - site_len:
                         max_seq_len // 2] = [float('nan')] * site_len
                fixes[k][max_seq_len // 2 - site_len:
                         max_seq_len // 2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len // 2 - site_len:
                         max_seq_len // 2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            ax2.plot(sites[r_enz], linewidth=2, color=next(color), alpha=0.9,
                     label='Undigested RE site (%s: %s)' % (
                         r_enz, r_sites[r_enz])
                     if any([f > 0 for f in fixes[r_enz]])
                     else 'Undigested & Dangling-Ends (%s: %s)' % (
                         r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        lines, labels = ax2.get_legend_handles_labels()

        # plot ligation sites
        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            ax3.plot(liges[(r1, r2)], linewidth=2, color=next(color),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' % (
                         r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')
        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        # plot dangling-ends
        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                ax4.plot(fixes[r_enz], linewidth=2, color=next(color),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (
                             r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][max_seq_len // 2])
        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len // 2])
        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = ((100. * (fixes[r_enz][0] + (
                    fixes[r_enz][(max_seq_len // 2)] if paired else 0))) /
                    nreads)
            else:
                des[r_enz] = ((100. * (sites[r_enz][0] + (
                    sites[r_enz][(max_seq_len // 2)] if paired else 0))) /
                    nreads)

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum(
                [lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                 for r_enz1 in r_enzs for r_enz2 in r_enzs
                 if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += ('Percentage of digested sites (not considering '
                      'Dangling-Ends) %s: %.1f%%\n' % (
                          r_enz,
                          100. * float(lcnt) / (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (
                r_enz, des[r_enz])
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += ('Percentage of reads with ligation site '
                          '(%s-%s): %.1f%%\n' % (
                              r_enz1, r_enz2,
                              (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0), loc=3,
                   borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    if len(r_enzs) == 1 and r_enzs[0] is None:
        return {}, {}
    return des, ligep
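
# Illustrative usage sketch (hypothetical file name; assumes 'MboI' is a key
# of RESTRICTION_ENZYMES): check quality and estimate dangling-ends for a
# paired-end Hi-C FASTQ.
def _example_quality_plot():
    dangling, ligated = quality_plot('hic_sample.fastq.gz', r_enz='MboI',
                                     nreads=100000, paired=True,
                                     savefig='hic_sample_quality.png')
    print(dangling)
    print(ligated)
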
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    add_site=True, min_seq_len=15, fastq=True, verbose=True,
                    light_storage=False, **kwargs):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence by ligation sites found,
       removes the ligation site, and puts back the original RE site.
    """
    skip = kwargs.get('skip', False)

    ## define local functions to process reads and sequences
    def _get_fastq_read_heavy(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()  # lose qualities header, not needed
        qal = fhandler.next()
        # header now also contains original read and quality
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_fastq_read_light(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()  # lose qualities header, not needed
        qal = fhandler.next()
        return (rlines, seq.strip(), qal.strip())

    def _get_map_read_heavy(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _get_map_read_light(line):
        header, seq, qal, _ = line.split('\t', 3)
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site='', cnt=0):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed and preceded by the RE site if a
        ligation site was found after the fragment.

        EXAMPLE:
           seq = '-------oGATCo========oGATCGATCo_____________oGATCGATCo~~~~~~~~~~~~'
           qal = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        should yield these fragments:
           '-------oGATCo========oGATC'
           'xxxxxxxxxxxxxxxxxxxxxxHHHH'

           'GATCo_____________oGATC'
           'HHHHxxxxxxxxxxxxxxxHHHH'

           'GATCo~~~~~~~~~~~~'
           'HHHHxxxxxxxxxxxxx'
        """
        cnt += 1
        try:
            pos = seq.index(pattern)
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt
            return
        xqal = ('H' * len(site))
        if pos < min_seq_len:
            split_read(site + seq[pos + len_relg:],
                       xqal + qal[pos + len_relg:],
                       pattern, max_seq_len, cnt=cnt)
        else:
            yield seq[:pos] + site, qal[:pos] + xqal, cnt
        new_pos = pos + len_relg
        for sseq, sqal, cnt in split_read(site + seq[new_pos:],
                                          xqal + qal[new_pos:], pattern,
                                          max_seq_len, site=site, cnt=cnt):
            yield sseq, sqal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        print '    * enzyme: %s, ligation site: %s, RE site: %s' % (
            r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enzyme = ''
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y, 1)

    # function to yield reads from input file
    if light_storage:
        get_seq = _get_fastq_read_light if fastq else _get_map_read_light
        insert_mark = insert_mark_light
    else:
        get_seq = _get_fastq_read_heavy if fastq else _get_map_read_heavy
        insert_mark = insert_mark_heavy

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
    counter = 0
    if skip:
        if fastq:
            print '  ... skipping, only counting lines'
        counter = sum(1 for _ in magic_open(fastq_path,
                                            cpus=kwargs.get('nthreads')))
        counter /= 4 if fastq else 1
        print '  ' + fastq_path, counter, fastq
        return out_fastq, counter

    # open input file
    fhandler = magic_open(fastq_path, cpus=kwargs.get('nthreads'))
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = enzyme if add_site else ''
    for header in fhandler:
        header, seq, qal = get_seq(header)
        counter += 1
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found; retry with half the ligation site in
            # case of a sequencing error (half a ligation site is an RE site,
            # or nearly, and should thus not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), '')
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        for seq, qal, cnt in iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq, qal, '0', '-\n'))))
    out.close()
    return out_name, counter
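
# Illustrative usage sketch (hypothetical file names): trim-only conversion to
# MAP format, without restriction-enzyme splitting (r_enz left as None).
def _example_transform_fastq_trim_only():
    out, nreads = transform_fastq('sample_read2.fastq',
                                  'sample_read2_trimmed.map',
                                  trim=(1, 50), r_enz=None)
    print 'Written %d reads to %s' % (nreads, out)
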
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile in tab separated format containing
       mapped read1 information
    :param out_file2: path to outfile in tab separated format containing
       mapped read2 information
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parsers.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove the temporary files required for the
       identification of multiple contacts
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = os.path.join(*outfiles[read].split('/')[:-1] +
                                ['tmp_' + outfiles[read].split('/')[-1]])
        # os.path.join drops the leading '/'; restore it for absolute paths
        tmp_name = ('/' * outfiles[read].startswith('/')) + tmp_name
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
                       stdin=PIPE, stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(pos) + len_seq - 1  # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside '
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## write the file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read_line = tmp_reads_fh.next()
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
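
# Illustrative usage sketch (hypothetical file names; assumes 'HindIII' is a
# key of RESTRICTION_ENZYMES): summarise iteratively mapped reads against a
# reference genome.
def _example_parse_map():
    genome = parse_fasta('genome.fa')
    parse_map(['read1_iter1.map', 'read1_iter2.map'],
              ['read2_iter1.map', 'read2_iter2.map'],
              out_file1='read1_parsed.tsv', out_file2='read2_parsed.tsv',
              genome_seq=genome, re_name='HindIII', verbose=True)
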
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    add_site=True, min_seq_len=15, fastq=True, verbose=True):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence by ligation sites found,
       removes the ligation site, and puts back the original RE site.
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()  # lose qualities header, not needed
        qal = fhandler.next()
        # header now also contains original read and quality
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_map_read(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site='', cnt=0):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation site
        was found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            cnt += 1
            pos = seq.index(pattern)
            if pos < min_seq_len:
                split_read(seq[pos + len_relg:], qal[pos + len_relg:],
                           pattern, max_seq_len, cnt=cnt)
            else:
                yield seq[:pos] + site, qal[:pos] + ('H' * len(site)), cnt
            for subseq, subqal, cnt in split_read(seq[pos + len_relg:],
                                                  qal[pos + len_relg:],
                                                  pattern, max_seq_len,
                                                  cnt=cnt):
                yield subseq, subqal, cnt
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        print '    * enzyme: %s, ligation site: %s, RE site: %s' % (
            r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enzyme = ''  # was missing: 'site' below needs it when r_enz is None
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y, 1)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else _get_map_read

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = enzyme if add_site else ''  # was inverted ('' if add_site else enzyme)
    for header in fhandler:
        header, seq, qal = get_seq(header)
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found; retry with half the ligation site in
            # case of a sequencing error (half a ligation site is an RE site,
            # or nearly, and should thus not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), '')
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        for seq, qal, cnt in iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq + site,
                                            qal + 'H' * (len(site)),
                                            '0', '-\n'))))
    out.close()
    return out_name
def parse_fasta(f_names, chr_names=None, chr_filter=None, chr_regexp=None,
                verbose=True, save_cache=True, reload_cache=False,
                only_length=False):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass a list of chromosome names, or just one. If
       None is passed, chromosome names will be inferred from fasta headers
    :param None chr_filter: use only chromosomes in the input list
    :param None chr_regexp: use only chromosomes matching the given regexp
    :param True save_cache: save a cached version of this file for faster
       loading (~4 times faster)
    :param False reload_cache: reload cached genome
    :param False only_length: returns a dictionary with the length of the
       genome, not the sequence
    :returns: a sorted dictionary with chromosome names as keys, and sequences
       as values (sequence in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]
    if len(f_names) == 1:
        fname = f_names[0] + '_genome.TADbit'
    else:
        fname = path.join(path.commonprefix(f_names), 'genome.TADbit')
    if path.exists(fname) and not reload_cache:
        if verbose:
            print 'Loading cached genome'
        genome_seq = OrderedDict()
        for line in open(fname):
            if line.startswith('>'):
                c = line[1:].strip()
            else:
                if only_length:
                    genome_seq[c] = len(line.strip())
                else:
                    genome_seq[c] = line.strip()
        return genome_seq

    if isinstance(chr_names, str):
        chr_names = [chr_names]

    if chr_filter:
        bad_chrom = lambda x: not x in chr_filter
    else:
        bad_chrom = lambda x: False

    if chr_regexp:
        chr_regexp = re.compile(chr_regexp)
    else:
        chr_regexp = re.compile('.*')

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                header = line[1:].split()[0]
                if bad_chrom(header) or not chr_regexp.match(header):
                    header = 'UNWANTED'
                elif not chr_names:
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        if only_length:
            # fix: was len(seq), which counts lines rather than bases
            genome_seq[header] = sum(len(l) for l in seq)
        else:
            genome_seq[header] = ''.join(seq).upper()
        if 'UNWANTED' in genome_seq:
            del genome_seq['UNWANTED']
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            if (bad_chrom(header) or
                                    not chr_regexp.match(header)):
                                header = 'UNWANTED'
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        if bad_chrom(header):
                            header = 'UNWANTED'
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            if only_length:
                genome_seq[header] = sum(len(l.rstrip()) for l in fhandler)
            else:
                genome_seq[header] = ''.join([l.rstrip()
                                              for l in fhandler]).upper()
        if 'UNWANTED' in genome_seq:
            del genome_seq['UNWANTED']
    if save_cache and not only_length:
        if verbose:
            print 'saving genome in cache'
        if len(f_names) == 1:
            fname = f_names[0] + '_genome.TADbit'
        else:
            fname = path.join(path.commonprefix(f_names), 'genome.TADbit')
        out = open(fname, 'w')
        for c in genome_seq:
            out.write('>%s\n%s\n' % (c, genome_seq[c]))
        out.close()
    return genome_seq
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    add_site=True, min_seq_len=15, fastq=True, verbose=True):
    """
    Given a FASTQ file, it can split it into chunks of a given number of
    reads, trim each read according to start/end positions, or split reads
    into restriction enzyme fragments

    :param True add_site: when splitting the sequence by ligation sites
       found, removes the ligation site and puts back the original RE site
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header, sequence and qualities of one FASTQ entry
        Note: the header also contains the sequence and qualities
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()    # lose the '+' separator line, not needed
        qal = fhandler.next()  # keep qualities
        # header now also contains the original sequence and qualities
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_map_read(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site=''):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation
        site was found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            pos = seq.index(pattern)
            if pos >= min_seq_len:
                # leading fragment long enough: yield it, followed by the site
                yield seq[:pos] + site, qal[:pos] + 'H' * len(site)
            # recurse on the remainder of the read, whether or not the
            # leading fragment was yielded
            for subseq, subqal in split_read(seq[pos + len_relg:],
                                             qal[pos + len_relg:],
                                             pattern, max_seq_len):
                yield subseq, subqal
        except ValueError:
            if len(seq) == max_seq_len:
                # no ligation site found anywhere in the read
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal

    # define function for stripping lines according to the trim coordinates
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x
    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print ' - splitting into restriction enzyme (RE) fragments using ligation sites'
        print ' - ligation sites are replaced by RE sites to match the reference genome'
        print ' * enzyme: %s, ligation site: %s, RE site: %s' % (
            r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enz_pattern = ''
        def split_read(seq, qal, pattern, max_seq_len, site):
            # no enzyme given: yield the full read as a single fragment
            yield seq, qal
    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else _get_map_read

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print ' - conversion to MAP format'
        if trim:
            print ' - trimming reads %d-%d' % tuple(trim)
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = '' if add_site else enzyme
    for header in fhandler:
        header, seq, qal = get_seq(header)
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found: retry with half the ligation site, in
            # case there was a sequencing error (half a ligation site is a
            # RE site, or nearly, and thus should not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), site)
            try:
                seq, qal = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((header, seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        for seq, qal in iter_frags:
            out.write(_map2fastq('\t'.join((header, seq + site,
                                            qal + 'H' * len(site),
                                            '0', '-\n'))))
    out.close()
    return out_name
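# Hedged usage sketch for transform_fastq above (not part of the original
# module): file names are hypothetical, and 'HindIII' is assumed to be a key
# of RESTRICTION_ENZYMES. Reads are trimmed to positions 1-75, then split at
# ligation sites into RE fragments written in MAP format.
def _example_transform_fastq():
    out = transform_fastq('reads.fastq', 'reads_frags.map', trim=(1, 75),
                          r_enz='HindIII', add_site=True, min_seq_len=15)
    return out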
def save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               nbad_columns, ncolumns, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # create lock file
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:
            # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MERGE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                JOBid int,
                unique (PATHid))""")
            cur.execute("""
            create table MERGE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Wrkd1Path int,
                Wrkd2Path int,
                Bed1Path int,
                Bed2Path int,
                MergePath int,
                unique (JOBid))""")
            cur.execute("""
            create table MERGE_STATs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                decay_corr text,
                eigen_corr text,
                reprod real,
                scc real,
                std_scc real,
                N_columns int,
                N_filtered int,
                Resolution int,
                bias1Path int,
                bias2Path int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type   , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Merge',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
                   param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)

        add_path(cur, decay_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, decay_corr_fig, 'FIGURE'    , jobid, opts.workdir)
        add_path(cur, eigen_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, eigen_corr_fig, 'FIGURE'    , jobid, opts.workdir)

        add_path(cur, opts.workdir , 'WORKDIR'    , jobid)
        add_path(cur, opts.workdir1, 'WORKDIR1'   , jobid, opts.workdir)
        add_path(cur, opts.workdir2, 'WORKDIR2'   , jobid, opts.workdir)
        add_path(cur, mreads1      , 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, mreads2      , 'EXT_HIC_BAM', jobid, opts.workdir)
        if not opts.skip_merge:
            add_path(cur, outbed   , 'HIC_BAM'    , jobid, opts.workdir)
        if opts.norm:
            add_path(cur, biases1  , 'BIASES'     , jobid, opts.workdir)
            add_path(cur, biases2  , 'BIASES'     , jobid, opts.workdir)
            biasid1 = get_path_id(cur, biases1, opts.workdir)
            biasid2 = get_path_id(cur, biases2, opts.workdir)
        else:
            biasid1 = 0
            biasid2 = 0

        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads1, opts.workdir)))
        bed1 = cur.fetchall()[0][0]
        if opts.workdir1:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir1, opts.workdir)))
            w1path = cur.fetchall()[0][0]
        else:
            w1path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads2, opts.workdir)))
        bed2 = cur.fetchall()[0][0]
        if opts.workdir2:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir2, opts.workdir)))
            w2path = cur.fetchall()[0][0]
        else:
            w2path = 0
        if not opts.skip_merge:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(outbed, opts.workdir)))
            outbedid = cur.fetchall()[0][0]
        if not opts.skip_comparison:
            decay_corr = '-'.join(['%.1f' % (v)
                                   for v in corr[:10:2]]).replace('0.', '.')
            eigen_corr = '-'.join(['%.2f' % (max(v))
                                   for v in eig_corr[:4]]).replace('0.', '.')
        else:
            decay_corr = eigen_corr = None
        if not opts.skip_merge:
            cur.execute("""
            insert into MERGE_OUTPUTs
            (Id  , JOBid, Wrkd1Path, Wrkd2Path, Bed1Path, Bed2Path, MergePath)
            values
            (NULL,    %d,        %d,        %d,       %d,       %d,        %d)
            """ % (jobid, w1path, w2path, bed1, bed2, outbedid))
        if not opts.skip_comparison:
            cur.execute("""
            insert into MERGE_STATs
            (Id  , JOBid, N_columns, N_filtered, Resolution, reprod, scc, std_scc, decay_corr, eigen_corr, bias1Path, bias2Path)
            values
            (NULL,    %d,        %d,         %d,         %d,     %f,  %f,      %f,       '%s',       '%s',        %d,        %d)
            """ % (jobid, ncolumns, nbad_columns, opts.reso, reprod, scc, std,
                   decay_corr, eigen_corr, biasid1, biasid2))

        if opts.workdir1:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile1 = opts.tmpdb1
                try:
                    # to copy in case read1 was already mapped for example
                    copyfile(path.join(opts.workdir1, 'trace.db'), dbfile1)
                except IOError:
                    pass
            else:
                dbfile1 = path.join(opts.workdir1, 'trace.db')
            tmpcon = lite.connect(dbfile1)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute(
                        "select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked1[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile1)
        if opts.workdir2:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile2 = opts.tmpdb2
                try:
                    # to copy in case read2 was already mapped for example
                    copyfile(path.join(opts.workdir2, 'trace.db'), dbfile2)
                except IOError:
                    pass
            else:
                dbfile2 = path.join(opts.workdir2, 'trace.db')
            tmpcon = lite.connect(dbfile2)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute(
                        "select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked2[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile2)

        for f in masked1:
            if f != 'valid-pairs':
                outmask = path.join(opts.workdir, '03_filtered_reads',
                                    'all_r1-r2_intersection_%s.tsv_%s.tsv' % (
                                        param_hash, f))
                out = open(outmask, 'w')
                try:
                    fh = magic_open(path.join(opts.workdir1,
                                              masked1[f]['path']))
                except IOError:  # plain file missing: try the gzipped version
                    fh = magic_open(path.join(opts.workdir1,
                                              masked1[f]['path'] + '.gz'))
                for line in fh:
                    out.write(line)
                try:
                    fh = magic_open(path.join(opts.workdir2,
                                              masked2[f]['path']))
                except IOError:  # plain file missing: try the gzipped version
                    fh = magic_open(path.join(opts.workdir2,
                                              masked2[f]['path'] + '.gz'))
                for line in fh:
                    out.write(line)
                out.close()
                add_path(cur, outmask, 'FILTER', jobid, opts.workdir)
            else:
                if opts.skip_merge:
                    outmask = 'NA'
                else:
                    outmask = outbed
            try:
                path_id = get_path_id(cur, outmask, opts.workdir)
            except IndexError:
                path_id = -1
            cur.execute("""
            insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, JOBid)
            values
            (NULL,     %d, '%s',  '%s',    %d)
            """ % (path_id, f,
                   masked1[f]['count'] + masked2[f]['count'], jobid))

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'MERGE_OUTPUTs')
        print_db(cur, 'MERGE_STATs')
        print_db(cur, 'FILTER_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
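# Hedged sketch (not part of the original module): inspect the trace.db
# written by save_to_db above and list registered jobs; assumes sqlite3 is
# imported as lite and os.path as path, as elsewhere in this module.
def _example_inspect_trace_db(workdir):
    con = lite.connect(path.join(workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute('select Id, Type, Launch_time, Finish_time from JOBs')
        for row in cur.fetchall():
            print row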
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    min_seq_len=20, fastq=True, verbose=True):
    """
    Given a FASTQ file, it can split it into chunks of a given number of
    reads, trim each read according to start/end positions, or split reads
    into restriction enzyme fragments
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of one FASTQ entry
        """
        rlines = rlines.rstrip('\n')
        line = fhandler.next()
        _ = fhandler.next()  # lose the '+' separator line, not needed
        _ = fhandler.next()  # lose qualities, not needed
        return rlines, line.strip()

    def _split_read_re(x, max_seq_len=None):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation
        site was found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            pos = x.index(enz_pattern)
            if pos >= min_seq_len:
                # leading fragment long enough: yield it, followed by the site
                yield x[:pos] + enzyme
            # recurse on the remainder of the read, whether or not the
            # leading fragment was yielded
            for x in split_read(x[pos + len_relg:], max_seq_len):
                yield x
        except ValueError:
            if len(x) > min_seq_len:
                if len(x) == max_seq_len:
                    # no ligation site found anywhere in the read
                    raise StopIteration
                yield x

    # define function for stripping lines according to the trim coordinates
    if isinstance(trim, tuple):
        beg, end = trim
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x
    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        len_relg = len(enz_pattern)
        print ' - splitting into restriction enzyme (RE) fragments using ligation sites'
        print ' - ligation sites are replaced by RE sites to match the reference genome'
        print ' * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz,
                                                                 enz_pattern,
                                                                 enzyme)
        split_read = _split_read_re
    else:
        def split_read(x, max_seq_len):
            # no enzyme given: yield the full read as a single fragment
            yield x
    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else lambda x: x.split('\t', 2)[:2]

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print ' - conversion to MAP format'
        if trim:
            print ' - trimming reads %d-%d' % tuple(trim)
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    for header in fhandler:
        header, line = get_seq(header)
        # trim on wanted region of the read
        line = strip_line(line)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(line, len(line))
        # the first fragment should not be preceded by the RE site
        try:
            frag = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        out.write('\t'.join((header, frag, 'H' * len(frag), '0', '-\n')))
        # the next fragments should be preceded by the RE site
        for frag in iter_frags:
            out.write('\t'.join((header, frag + enzyme,
                                 'H' * (len(frag) + len(enzyme)),
                                 '0', '-\n')))
    out.close()
    return out_name
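# Hedged usage sketch for this earlier transform_fastq (not part of the
# original module): unlike the later version above, qualities are discarded
# and a dummy 'H' quality is written for every base. File names are
# hypothetical; 'MboI' is assumed to be a key of RESTRICTION_ENZYMES.
def _example_transform_fastq_old():
    return transform_fastq('reads.fastq', 'reads_frags.map', r_enz='MboI',
                           min_seq_len=20)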
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results in two tab-separated files that will
    contain seven columns: read ID, Chromosome, position, strand (either 0
    or 1), mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site.

    The position of reads mapped on the reverse strand will be computed from
    the end of the read (original position + read length - 1)

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile in tab-separated format containing
       mapped read1 information
    :param out_file2: path to outfile in tab-separated format containing
       mapped read2 information
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for identification
       of multiple contacts
    :param False compress: compress (gzip) input map files. This is done in
       the background while next MAP files are parsed, or while files are
       sorted.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis = {}
    procs = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads = []
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            if verbose:
                print 'loading file: %s' % (fnam)
            # set read counter
            read_count = 0
            # start parsing
            try:
                while True:
                    for _ in xrange(max_size):
                        try:
                            reads.append(read_read(fhandler.next(),
                                                   frags, frag_chunk))
                        except KeyError:  # chromosome not in hash
                            continue
                        read_count += 1
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            except StopIteration:
                fhandler.close()
                nfile += 1
                write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            windows[read][num] = read_count
            if kwargs.get('compress', False) and fnam.endswith('.map'):
                print 'compressing input MAP file'
                procs.append(Popen(['gzip', fnam]))
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)

        # we now have sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]

        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## write the file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## multi-contacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files '
                                'and chromosome names (in genome.fasta and '
                                'SAM/MAP files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
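# Hedged usage sketch for parse_map above (not part of the original module):
# paths are hypothetical; the genome comes from parse_fasta, and one MAP file
# per iterative-mapping round is passed for each read end (the trailing
# '.1:25' suffix mimics the window naming parse_map expects).
def _example_parse_map():
    genome = parse_fasta('hg19.fa')
    windows, multis = parse_map(['r1.map.1:25', 'r1.map.25:50'],
                                ['r2.map.1:25', 'r2.map.25:50'],
                                out_file1='r1_parsed.tsv',
                                out_file2='r2_parsed.tsv',
                                genome_seq=genome, re_name='HindIII',
                                verbose=True)
    return windows, multis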
def parse_fasta(f_names, chr_names=None, chr_filter=None, chr_regexp=None,
                verbose=True):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass list of chromosome names, or just one. If
       None is passed, chromosome names will be inferred from fasta headers
    :param None chr_filter: use only chromosomes in the input list
    :param None chr_regexp: use only chromosomes matching the regular
       expression
    :returns: a sorted dictionary with chromosome names as keys, and
       sequences as values (sequence in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]
    if isinstance(chr_names, str):
        chr_names = [chr_names]

    if chr_filter:
        bad_chrom = lambda x: x not in chr_filter
    else:
        bad_chrom = lambda x: False

    if chr_regexp:
        chr_regexp = re.compile(chr_regexp)
    else:
        chr_regexp = re.compile('.*')

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                header = line[1:].split()[0]
                if bad_chrom(header) or not chr_regexp.match(header):
                    header = 'UNWANTED'
                elif not chr_names:
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        genome_seq[header] = ''.join(seq).upper()
        if 'UNWANTED' in genome_seq:
            del genome_seq['UNWANTED']
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            if bad_chrom(header) or not chr_regexp.match(header):
                                header = 'UNWANTED'
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        if bad_chrom(header):
                            header = 'UNWANTED'
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            genome_seq[header] = ''.join([l.rstrip()
                                          for l in fhandler]).upper()
        if 'UNWANTED' in genome_seq:
            del genome_seq['UNWANTED']
    return genome_seq
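# Hedged usage sketch for this earlier parse_fasta (not part of the original
# module): the two filtering modes, an explicit chromosome list or a regular
# expression; 'genome.fa' is a hypothetical path.
def _example_parse_fasta_filtered():
    by_list = parse_fasta('genome.fa', chr_filter=['chr1', 'chr2'])
    by_regexp = parse_fasta('genome.fa', chr_regexp='^chr[12]$')
    return by_list, by_regexp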