Example #1
def parse_fasta(f_names, chr_names=None, verbose=True):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass a list of chromosome names, or just one. If None
       is passed, chromosome names will be inferred from the FASTA headers

    :returns: an ordered dictionary with chromosome names as keys, and sequences
       as values (sequences in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]
    if isinstance(chr_names, str):
        chr_names = [chr_names]

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                if not chr_names:
                    header = line[1:].split()[0]
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        genome_seq[header] = ''.join(seq).upper()
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            genome_seq[header] = ''.join([l.rstrip()
                                          for l in fhandler]).upper()
    return genome_seq
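A minimal usage sketch for the function above (not from the original source): the file names and chromosome labels are hypothetical, and magic_open and OrderedDict are assumed to be importable in the calling module.

# Hypothetical call: load two gzipped FASTA files under explicit chromosome names.
genome = parse_fasta(['chr1.fa.gz', 'chr2.fa.gz'], chr_names=['chr1', 'chr2'])
for crm in genome:
    print('%s: %d bp' % (crm, len(genome[crm])))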
Example #2
def parse_fasta(f_names, chr_names=None, verbose=True):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass a list of chromosome names, or just one. If None
       is passed, chromosome names will be inferred from the FASTA headers

    :returns: an ordered dictionary with chromosome names as keys, and sequences
       as values (sequences in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]
    if isinstance(chr_names, str):
        chr_names = [chr_names]

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith(">"):
                if header:
                    genome_seq[header] = "".join(seq).upper()
                if not chr_names:
                    header = line[1:].split()[0]
                    if verbose:
                        print "Parsing %s" % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print "Parsing %s as %s" % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        genome_seq[header] = "".join(seq).upper()
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith(">"):
                            header = header[1:].split()[0]
                            genome_seq[header] = ""
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        genome_seq[header] = ""
                        break
            except StopIteration:
                raise Exception("No crocodiles found, is it fasta?")
            genome_seq[header] = "".join([l.rstrip() for l in fhandler]).upper()
    return genome_seq
Example #3
def _gem_filter(fnam, unmap_out, map_out):
    """
    Divides reads in a map file into two categories: uniquely mapped, and not.
    Writes them to two files

    Notes:
       - GEM unique-maps cannot be used as it gets rid of reads like 1:0:0:5
       - not feasible with gt.filter
    """
    fhandler = magic_open(fnam) if isinstance(fnam, str) else fnam
    unmap_out = open(unmap_out, 'w')
    map_out = open(map_out, 'w')
    for line in fhandler:
        matches = line.rsplit('\t', 2)[1]
        bad = False
        if matches != '1':
            for m in matches.replace('+', ':').split(':'):
                if m == '0':
                    continue
                if  m != '1':
                    bad = True
                    unmap_out.write(line)
                    break
                break
            else:
                bad = True
                unmap_out.write(line)
        if not bad:
            map_out.write(line)
    unmap_out.close()
    map_out.close()
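A hedged usage sketch for the filter above; the GEM .map file names are made up for illustration.

# Hypothetical call: split a GEM map file into uniquely mapped and non-unique reads.
_gem_filter('sample_read1.map',
            unmap_out='sample_read1_multi.map',
            map_out='sample_read1_uniq.map')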
Example #4
def get_mapped_chunk(map_folder, nreads):
    """
    Loads GEM-mapped reads from every file in map_folder and yields them in
    dictionaries of roughly nreads entries, keyed by (read id, mapping position)
    with (sequence, quality) as values.
    """
    seqs = {}
    printime(' - loading chunk')
    pos_file = 0
    for fname in os.listdir(map_folder):
        printime('    - ' + fname)
        fhandler = magic_open(os.path.join(map_folder, fname))
        for line in fhandler:
            pos_file += 1
            rid, seq, qal, _, pos = line.split()
            pos = int(pos.split(':')[2])
            rid = rid.split('~')[0]
            seqs[rid, pos] = (seq, qal)
            if pos_file >= nreads:
                yield seqs
                printime(' - loading chunk')
                seqs = {}
                pos_file = 0
    yield seqs
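A sketch of how the generator above might be consumed; the folder name and chunk size are illustrative, and printime and magic_open are assumed to be available.

# Hypothetical loop: process mapped reads in chunks of about one million entries.
for chunk in get_mapped_chunk('mapped_read1/', nreads=1000000):
    print('loaded %d (read id, position) pairs' % len(chunk))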
Example #5
def _gem_filter(fnam, unmap_out, map_out):
    """
    Divides reads in a map file into two categories: uniquely mapped, and not.
    Writes them to two files

    Notes:
       - GEM unique-maps cannot be used as it gets rid of reads like 1:0:0:5
       - not feasible with gt.filter
    """
    fhandler = magic_open(fnam) if isinstance(fnam, basestring) else fnam
    unmap_out = open(unmap_out, 'w')
    map_out = open(map_out, 'w')

    def _strip_read_name(line):
        """
        remove original sequence from read name when read is mapped uniquely
        """
        header, line = line.split('\t', 1)
        return '\t'.join((header.rsplit(' ', 2)[0], line))

    for line in fhandler:
        matches = line.rsplit('\t', 2)[1]
        bad = False
        if matches != '1':
            for m in matches.replace('+', ':').split(':'):
                if m == '0':
                    continue
                if m != '1':
                    bad = True
                    unmap_out.write(line)
                    break
                break
            else:
                bad = True
                unmap_out.write(line)
        if not bad:
            map_out.write(_strip_read_name(line))
    unmap_out.close()
    map_out.close()
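The uniqueness test above amounts to checking the first non-zero stratum of the GEM match summary. The standalone toy below is not part of the library; it restates that rule and exercises it on made-up summaries.

def _is_unique(matches):
    # True when the first non-zero stratum reports exactly one hit
    if matches == '1':
        return True
    for m in matches.replace('+', ':').split(':'):
        if m == '0':
            continue
        return m == '1'
    return False  # all strata are 0: the read is unmapped

print(_is_unique('1:0:0:5'))  # True: one best hit, extra hits only in worse strata
print(_is_unique('0:2'))      # False: two equally good hits
print(_is_unique('0:0:0'))    # False: unmapped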
Example #6
def _gem_filter(fnam, unmap_out, map_out):
    """
    Divides reads in a map file into two categories: uniquely mapped, and not.
    Writes them to two files

    Notes:
       - GEM unique-maps cannot be used as it gets rid of reads like 1:0:0:5
       - not feasible with gt.filter
    """
    fhandler = magic_open(fnam) if isinstance(fnam, str) else fnam
    unmap_out = open(unmap_out, 'w')
    map_out   = open(map_out  , 'w')
    def _strip_read_name(line):
        """
        remove original sequence from read name when read is mapped uniquely
        """
        header, line = line.split('\t', 1)
        return '\t'.join((header.rsplit(' ', 2)[0], line))
    for line in fhandler:
        matches = line.rsplit('\t', 2)[1]
        bad = False
        if matches != '1':
            for m in matches.replace('+', ':').split(':'):
                if m == '0':
                    continue
                if  m != '1':
                    bad = True
                    unmap_out.write(line)
                    break
                break
            else:
                bad = True
                unmap_out.write(line)
        if not bad:
            map_out.write(_strip_read_name(line))
    unmap_out.close()
    map_out.close()
Example #7
def get_intersection(fname1, fname2, out_path, verbose=False):
    """
    Merges the two files corresponding to each read end. Reads found in both
       files are merged and written to an output file.

    Dealing with multiple contacts:
       - a pairwise contact is created for each possible combination of the
         multicontacts. The name of the read is extended by '# 1/3' in case
         the reported pairwise contact corresponds to the first of 3 possible ones
       - it may happen that different contacts are mapped on a single RE fragment
         (if each is on a different end), in which case:
          - if no other fragment from this read is mapped, then both are kept
          - otherwise, they are merged into one longer fragment (as if they were
            mapped on the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format similar
       to the inputs

    :returns: final number of pairs of interacting fragments, and a dictionary with
       the number of multiple contacts (the keys of the dictionary being the number
       of fragments caught together, can be 3, 4, 5..)
    """

    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = reads1.next()
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = reads1.next()
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = reads2.next()
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = reads2.next()
    read2 = line2.split('\t', 1)[0]
    if header1 != header2:
        raise Exception('seems to be mapped on different chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos / nchunks
    buf = dict([(i, []) for i in xrange(nchunks + 1)])
    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in xrange(nchunks / int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary and then into temporary files
    # the dictionary is emptied every 1 million entries
    if verbose:
        print ('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        ('  %4d million reads' % (count_dots)) if
                        count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                    stdout.flush()
                count_dots += 1
            for _ in xrange(1000000): # iterate 1 million times, write to files
                # same read id in both lines: we put the more upstream one
                # first and store them
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
    write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print '\nFound %d pairs of reads mapping uniquely' % count

    # sort each tmp file according to first element (idx) and write them
    # to output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    #      and also according to strand
    if verbose:
        print 'Sorting each temporary file by genomic coordinate'

    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        out.write(''.join(['\t'.join(l[1:]) for l in sorted(
            [l.split('\t') for l in open(
                path.join(tmp_dir, 'rep_%03d' % (b / int(nchunks**0.5)),
                          'tmp_%05d.tsv' % b))],
            key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()

    if verbose:
        print '\nRemoving temporary files...'
    system('rm -rf ' + tmp_dir)
    return count, multiples
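A hedged usage sketch: intersect the two per-end files produced upstream. The file names are hypothetical.

# Hypothetical call: merge the read1 and read2 tables into a single pair file.
count, multiples = get_intersection('reads1.tsv', 'reads2.tsv',
                                    'reads_both.tsv', verbose=True)
print('%d read pairs, multi-contact histogram: %s' % (count, multiples))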
Example #8
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True,
                    light_storage=False, **kwargs):
    """
    Given a FASTQ file, it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence by the ligation sites found,
       removes the ligation site and puts back the original RE site.

    """
    skip = kwargs.get('skip', False)
    ## define local functions to process reads and sequences
    def _get_fastq_read_heavy(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' separator line (not needed)
        qal = fhandler.next()  # qualities
        # header now also contains original read
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_fastq_read_light(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header does not embed the sequence (light storage)
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' separator line (not needed)
        qal = fhandler.next()  # qualities
        return (rlines, seq.strip(), qal.strip())

    def _get_map_read_heavy(line):
        header = line.split('\t', 1)[0]
        seq, qal    = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _get_map_read_light(line):
        header, seq, qal, _ = line.split('\t', 3)
        return header, seq, qal

    def _inverse_find(seq, pat):
        try:
            return pat.search(seq).start()
        except AttributeError:
            return 'nan'

    def find_patterns(seq, patterns):
        pos, pat = min((_inverse_find(seq, patterns[p]), p) for p in patterns)
        return int(pos), pat

    def _split_read_re(seq, qal, patterns, site, max_seq_len=None, cnt=0):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed and preceded by the RE site if a
        ligation site was found after the fragment.

        EXAMPLE:

           seq = '-------oGATCo========oGATCGATCo_____________oGATCGATCo~~~~~~~~~~~~'
           qal = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

        should yield these fragments:

            -------oGATCo========oGATC
            xxxxxxxxxxxxxxxxxxxxxxHHHH

            GATCo_____________oGATC
            HHHHxxxxxxxxxxxxxxxHHHH

            GATCo~~~~~~~~~~~~
            HHHHxxxxxxxxxxxxx

        :param seq: sequence of the read fragment
        :param qal: quality of the sequence of the read fragment
        :param patterns: list of patterns of the ligated cut sites
        :param None max_seq_len: to control that all reads are below this
           length
        :param '' site: non-ligated cut site to replace ligation site
        :param 0 cnt: to count number of fragments

        :yields: seq fragments, their qualities and their count, or index
           (higher than 0 if ligation sites are found)
        """
        cnt += 1
        try:
            pos, (r_enz1, r_enz2) = find_patterns(seq, patterns)
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt
            return
        # add quality before corresponding to the space occupied by the cut-site
        xqal1 = ('H' * len(site[r_enz1]))
        xqal2 = ('H' * len(site[r_enz2]))
        if pos < min_seq_len:
            split_read(site[r_enz2] + seq[pos + len_relgs[(r_enz1, r_enz2)]:],
                       xqal2        + qal[pos + len_relgs[(r_enz1, r_enz2)]:],
                       patterns, no_site, max_seq_len, cnt=cnt)
        else:
            yield seq[:pos] + site[r_enz1], qal[:pos] + xqal1, cnt
        new_pos = pos + len_relgs[(r_enz1, r_enz2)]
        for sseq, sqal, cnt in split_read(site[r_enz2] + seq[new_pos:],
                                          xqal2 + qal[new_pos:], patterns,
                                          site, max_seq_len, cnt=cnt):
            yield sseq, sqal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        r_enzs = [r_enz]
    elif isinstance(r_enz, list):
        r_enzs = r_enz
    else:
        r_enzs = None

    if r_enzs:
        enzymes = {}
        enz_patterns = {}
        for r_enz in r_enzs:
            enzymes[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_patterns = religateds(r_enzs)
        sub_enz_patterns = {}
        len_relgs = {}
        for r_enz1, r_enz2 in enz_patterns:
            sub_enz_patterns[(r_enz1, r_enz2)] = (
                enz_patterns[(r_enz1, r_enz2)][:len(enz_patterns[(r_enz1, r_enz2)])
                                               / 2])
            len_relgs[(r_enz1, r_enz2)] = len(enz_patterns[(r_enz1, r_enz2)])
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                print '    * enzymes: %s & %s, ligation site: %s, RE site: %s & %s' % (
                    r_enz1, r_enz2, enz_patterns[(r_enz1, r_enz2)],
                    enzymes[r_enz1], enzymes[r_enz2])
        # replace pattern with regex to support IUPAC annotation
        for ezp in enz_patterns:
            enz_patterns[ezp] = re.compile(iupac2regex(enz_patterns[ezp]))
        for ezp in sub_enz_patterns:
            sub_enz_patterns[ezp] = iupac2regex(sub_enz_patterns[ezp])
        split_read = _split_read_re
    else:
        enzymes = ''
        enz_patterns = ''
        sub_enz_patterns = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y , 1)

    # function to yield reads from input file
    if light_storage:
        get_seq = _get_fastq_read_light if fastq else _get_map_read_light
        insert_mark = insert_mark_light
    else:
        get_seq = _get_fastq_read_heavy if fastq else _get_map_read_heavy
        insert_mark = insert_mark_heavy

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
    counter = 0
    if skip:
        if fastq:
            print '    ... skipping, only counting lines'
            counter = sum(1 for _ in magic_open(fastq_path,
                                                cpus=kwargs.get('nthreads')))
            counter /= 4 if fastq else 1
            print '            ' + fastq_path, counter, fastq
        return out_fastq, counter
    # open input file
    fhandler = magic_open(fastq_path, cpus=kwargs.get('nthreads'))
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    no_site = dict([(r_enz, '') for r_enz in enzymes])
    site = enzymes if add_site else no_site
    for header in fhandler:
        header, seq, qal = get_seq(header)
        counter += 1
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_patterns, site, len(seq))
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # or not ligation site found, in which case we try with half
            # ligation site in case there was a sequencing error (half ligation
            # site is a RE site or nearly, and thus should not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_patterns, no_site, len(seq))
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        # continue
        for seq, qal, cnt in  iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq, qal, '0', '-\n'))))
    out.close()
    return out_name, counter
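A hedged call sketch for the function above, assuming 'MboI' is a valid key of RESTRICTION_ENZYMES; all paths are made up.

# Hypothetical call: convert a FASTQ file to MAP format, trimming reads to
# positions 1-75 and splitting them at MboI ligation sites.
out_name, nreads = transform_fastq('sample_read1.fastq.gz', 'sample_read1.map',
                                   trim=(1, 75), r_enz='MboI', add_site=True,
                                   min_seq_len=20, fastq=True)
print('%d reads written to %s' % (nreads, out_name))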
Example #9
def parse_bed(fnam, resolution=1):
    """
    simple BED and BEDgraph parser that only checks fields 1, 2, 3 and 5
       (or 1, 2 and 3 if column 5 is not available).

    .. note::

        2 or 3 columns files can also be passed and will be interpreted,
        respectively, as chromosome/begin and chromosome/begin/end


    :param fnam: path to BED file
    :param 1 resolution: to bin the resulting dictionary

    :returns: a dictionary with a count of the number of entries found per bin. In
       case column 5 is present, its values are used to weight entries; otherwise
       each entry will weigh 1.

    """

    fhandler = magic_open(fnam)
    line = fhandler.next()
    fpos = len(line)
    while (line.startswith('#')     or
           line.startswith('track') or
           line.startswith('browser')):
        fpos += len(line)
        line = fhandler.next()
    ##################
    # check file type
    try:
        # classic BED
        _, _, _, _, val, _ =  line.split('\t', 5)
        try:
            float(val)
            parse_line = _bed_float
        except ValueError:
            parse_line = _bed_one
    except ValueError:
        try:
            # BEDgraph
            _, _, _, val =  line.split('\t', 5)
            parse_line = _bedgraph_float
        except ValueError:
            try:
                # BEDgraph with no values
                _, _, _ =  line.split()
                parse_line = _3_col
            except ValueError:
                # only chromosome and begin position available
                parse_line = _2_col

    ####################################
    # go back to first informative line
    # parse
    dico = {}
    fhandler.seek(fpos)
    for line in fhandler:
        crm, beg, end, val = parse_line(line)
        pos = (beg + end - beg) / resolution
        dico.setdefault(crm, {})
        dico[crm].setdefault(pos, 0)
        dico[crm][pos] += val

    return dico
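A minimal usage sketch for parse_bed; the BED file name is hypothetical.

# Hypothetical call: bin a BED file at 100 kb resolution and report per-chromosome totals.
cover = parse_bed('peaks.bed', resolution=100000)
for crm in cover:
    print('%s: %s' % (crm, sum(cover[crm].values())))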
Example #10
def parse_map(f_names1,
              f_names2=None,
              out_file1=None,
              out_file2=None,
              genome_seq=None,
              re_name=None,
              verbose=False,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pytadbit.parsers.genome_parser.parse_fasta`,
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name,
                         genome_seq,
                         frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = '/'.join(outfiles[read].split('/')
                            [:-1]) + '/tmp_' + outfiles[read].split('/')[-1]
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
                       stdin=PIPE,
                       stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(
                        pos) + len_seq - 1  # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write(
                    '%s\t%s\t%d\t%d\t%d\t%d\t%d\n' %
                    (name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1

        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()

        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read = tmp_reads_fh.next()
        prev_head = read.split('\t', 1)[0]
        prev_read = read.strip()
        for read in tmp_reads_fh:
            head = read.split('\t', 1)[0]
            if head == prev_head:
                prev_read += '|||' + read.strip()
            else:
                reads_fh.write(prev_read + '\n')
                prev_read = read.strip()
            prev_head = head
        reads_fh.write(prev_read + '\n')
        reads_fh.close()
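A hedged sketch chaining the parsers shown in these examples; every path and the enzyme name are hypothetical.

# Hypothetical pipeline step: load the genome, then parse the iterative-mapping
# output of each read end into two tab-separated summary files.
genome = parse_fasta('genome.fa')
parse_map(['read1_iter1.map', 'read1_iter2.map'],
          ['read2_iter1.map', 'read2_iter2.map'],
          out_file1='reads1.tsv', out_file2='reads2.tsv',
          genome_seq=genome, re_name='MboI', verbose=True)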
Example #11
def get_intersection(fname1, fname2, out_path, verbose=False, compress=False):
    """
    Merges the two files corresponding to each read end. Reads found in both
       files are merged and written to an output file.

    Dealing with multiple contacts:
       - a pairwise contact is created for each possible combination of the
         multicontacts. The name of the read is extended by '# 1/3' in case
         the reported pairwise contact corresponds to the first of 3 possible ones
       - it may happen that different contacts are mapped on a single RE fragment
         (if each is on a different end), in which case:
          - if no other fragment from this read is mapped, then both are kept
          - otherwise, they are merged into one longer fragment (as if they were
            mapped on the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format similar
       to the inputs
    :param False compress: compress (gzip) input files. This is done in the
       background while the next input files are parsed.

    :returns: final number of pairs of interacting fragments, and a dictionary with
       the number of multiple contacts (the keys of the dictionary being the number
       of fragments caught together, can be 3, 4, 5..)
    """

    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = next(reads1)
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = next(reads1)
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = next(reads2)
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = next(reads2)
    read2 = line2.split('\t', 1)[0]
    if header1 != header2:
        raise Exception('seems to be mapped on different chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos // nchunks
    buf = dict([(i, []) for i in range(nchunks + 1)])
    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in range(nchunks // int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary and then into temporary files
    # the dictionary is emptied every 1 million entries
    if verbose:
        print ('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        ('  %4d million reads' % (count_dots)) if
                        count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                    stdout.flush()
                count_dots += 1
            for _ in range(1000000): # iterate 1 million times, write to files
                # same read id in both lines: we put the more upstream one
                # first and store them
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
    write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print('\nFound %d pairs of reads mapping uniquely' % count)

    # compression
    if compress:
        if verbose:
            print('compressing input files')
        procs = [Popen(['gzip', f]) for f in (fname1, fname2)]
    # sort each tmp file according to first element (idx) and write them
    # to output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    #      and also according to strand
    if verbose:
        print('Sorting each temporary file by genomic coordinate')

    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        with open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                            'tmp_%05d.tsv' % b)) as f_tmp:
            out.write(''.join(['\t'.join(l[1:]) for l in sorted(
                [l.split('\t') for l in f_tmp],
                key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()

    if compress:
        for proc in procs:
            proc.communicate()
        system('rm -rf ' + fname1)
        system('rm -rf ' + fname2)
    if verbose:
        print('\nRemoving temporary files...')
    system('rm -rf ' + tmp_dir)
    return count, multiples
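Usage is the same as in the earlier get_intersection example; the sketch below only adds the compress flag, which gzips the two input files in the background once they have been read (paths are hypothetical).

# Hypothetical call with background compression of the inputs.
count, multiples = get_intersection('reads1.tsv', 'reads2.tsv', 'reads_both.tsv',
                                    verbose=True, compress=True)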
Example #12
def quality_plot(fnam,
                 r_enz=None,
                 nreads=float('inf'),
                 axe=None,
                 savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necessary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using the matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both read ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'
    )])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, basestring):
        r_enzs = [r_enz]
    else:
        r_enzs = [None]  # no restriction enzyme: only plot qualities
    # normalize enzyme names to their canonical spelling
    for k in list(RESTRICTION_ENZYMES.keys()):
        for i in range(len(r_enzs)):
            if k.lower() == str(r_enzs[i]).lower():
                r_enzs[i] = k

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = OrderedDict()
    ligep = {}
    tkw = dict(size=4, width=1.5)
    fhandler = magic_open(fnam)
    if len(r_enzs) == 1 and r_enzs[0] is None:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        l_sites = OrderedDict((k, iupac2regex(l_sites[k])) for k in l_sites)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(iupac2regex(r_sites[r_enz]))
            fixe[r_enz] = re.compile(iupac2regex(d_sites[r_enz]))
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0  # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        callback = lambda pat: pat.group(0).lower()
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in list(l_sites.values()):
                seq = re.sub(lig.upper(), callback, seq)
            for r_enz in r_enzs:
                sites[r_enz].extend(
                    [m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend(
                    [m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each paired of cut-site
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                if lige[k].search(seq):
                    ligep[k] += 1
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = list(zip(*[(nanmean(q), nanstd(q))
                                       for q in quals]))
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if len(r_enzs) == 1 and r_enzs[0] is None:  # do both plots
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        else:  # only do the quality_plot plot
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both',
                       direction='out',
                       top=False,
                       right=False,
                       left=False,
                       bottom=False)
        ax.tick_params(axis='both',
                       direction='out',
                       top=False,
                       right=False,
                       left=False,
                       bottom=False,
                       which='minor')

    ax.errorbar(list(range(max_seq_len)),
                meanquals,
                linewidth=1,
                elinewidth=1,
                color='darkblue',
                yerr=errorquals,
                ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in range(max_seq_len)],
             linewidth=1,
             color='black',
             linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        with catch_warnings():
            simplefilter("ignore")
            axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if not (len(r_enzs) == 1 and r_enzs[0] is None):
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' %
                     (', '.join(map(str, r_enzs)), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k)
                            for k in range(seq_len)]  # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k)
                            for k in range(seq_len)]  # DE
        for r1, r2 in liges:
            liges[(r1,
                   r2)] = [liges[(r1, r2)].count(k)
                           for k in range(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])

                fixes[r_enz] = (fixes[r_enz][:pos] + [
                    fixes[r_enz][k] - sites[r_enz][k - pos]
                    for k in range(pos, seq_len)
                ])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] + [
                    fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                    for k in range(pos, seq_len)
                ])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len
                fixes[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            # print 'undigested', r_enz
            # print sites[r_enz][:20]
            ax2.plot(
                sites[r_enz],
                linewidth=2,
                color=next(color),
                alpha=0.9,
                label='Undigested RE site (%s: %s)' %
                (r_enz, r_sites[r_enz]) if any([f > 0 for f in fixes[r_enz]])
                else 'Undigested & Dangling-Ends (%s: %s)' %
                (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            # print 'ligated', r1, r2
            # print liges[(r1, r2)][:20]
            ax3.plot(liges[(r1, r2)],
                     linewidth=2,
                     color=next(color),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' %
                     (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                # print 'repaired', r_enz
                # print fixes[r_enz][:20]
                ax4.plot(fixes[r_enz],
                         linewidth=2,
                         color=next(color),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' %
                         (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][max_seq_len // 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len // 2])

        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = (
                    (100. *
                     (fixes[r_enz][0] +
                      (fixes[r_enz][(max_seq_len // 2)] if paired else 0))) /
                    nreads)
            else:
                des[r_enz] = (100. * (sites[r_enz][0] + (sites[r_enz][
                    (max_seq_len // 2)] if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(
                sum([
                    lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                    for r_enz1 in r_enzs for r_enz2 in r_enzs
                    if r_enz1 == r_enz or r_enz2 == r_enz
                ]))
            title += (
                'Percentage of digested sites (not considering Dangling-Ends) '
                '%s: %.1f%%\n' % (r_enz, 100. * float(lcnt) /
                                  (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz,
                                                                   des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += (
                    'Percentage of reads with ligation site (%s-%s): %.1f%% \n'
                    % (r_enz1, r_enz2,
                       (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines,
                   labels,
                   bbox_to_anchor=(0.75, 1.0),
                   loc=3,
                   borderaxespad=0.,
                   frameon=False,
                   fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    if len(r_enzs) == 1 and r_enzs[0] is None:
        return {}, {}
    return des, ligep
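A hedged call sketch for quality_plot, assuming 'MboI' is a known enzyme name; the FASTQ path and output figure name are made up.

# Hypothetical call: plot quality and ligation/dangling-end statistics for the
# first 100,000 reads of an MboI-digested library.
dangling_pct, ligation_pct = quality_plot('sample_read1.fastq.gz', r_enz='MboI',
                                          nreads=100000, savefig='quality.png')
print(dangling_pct, ligation_pct)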
Example #13
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True,
                    light_storage=False, **kwargs):
    """
    Given a FASTQ file, it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence by the ligation sites found,
       removes the ligation site and puts back the original RE site.

    """
    skip = kwargs.get('skip', False)
    ## define local functions to process reads and sequences
    def _get_fastq_read_heavy(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' separator line (not needed)
        qal = fhandler.next()  # qualities
        # header now also contains original read
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_fastq_read_light(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header does not embed the sequence (light storage)
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' separator line (not needed)
        qal = fhandler.next()  # qualities
        return (rlines, seq.strip(), qal.strip())

    def _get_map_read_heavy(line):
        header = line.split('\t', 1)[0]
        seq, qal    = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _get_map_read_light(line):
        header, seq, qal, _ = line.split('\t', 3)
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site='', cnt=0):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed and preceded by the RE site if a
        ligation site was found after the fragment.

        EXAMPLE:

           seq = '-------oGATCo========oGATCGATCo_____________oGATCGATCo~~~~~~~~~~~~'
           qal = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

        should yield these fragments:

            -------oGATCo========oGATC
            xxxxxxxxxxxxxxxxxxxxxxHHHH

            GATCo_____________oGATC
            HHHHxxxxxxxxxxxxxxxHHHH

            GATCo~~~~~~~~~~~~
            HHHHxxxxxxxxxxxxx

        """
        cnt += 1
        try:
            pos = seq.index(pattern)
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt
            return
        xqal = ('H' * len(site))
        if pos < min_seq_len:
            split_read(site + seq[pos + len_relg:],
                       xqal + qal[pos + len_relg:],
                       pattern, max_seq_len, cnt=cnt)
        else:
            yield seq[:pos] + site, qal[:pos] + xqal, cnt
        new_pos = pos + len_relg
        for sseq, sqal, cnt in split_read(site + seq[new_pos:],
                                          xqal + qal[new_pos:], pattern,
                                          max_seq_len, site=site, cnt=cnt):
            yield sseq, sqal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
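        # illustrative note (added): for an enzyme whose cut site is GATC, the
        # religated pattern would look like 'GATCGATC' (as in the docstring
        # example of _split_read_re) and sub_enz_pattern would be 'GATC'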
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        print '    * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enzyme = ''
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y , 1)

    # function to yield reads from input file
    if light_storage:
        get_seq = _get_fastq_read_light if fastq else _get_map_read_light
        insert_mark = insert_mark_light
    else:
        get_seq = _get_fastq_read_heavy if fastq else _get_map_read_heavy
        insert_mark = insert_mark_heavy

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
    counter = 0
    if skip:
        if fastq:
            print '    ... skipping, only counting lines'
            counter = sum(1 for _ in magic_open(fastq_path,
                                                cpus=kwargs.get('nthreads')))
            counter /= 4 if fastq else 1
            print '            ' + fastq_path, counter, fastq
        return out_fastq, counter
    # open input file
    fhandler = magic_open(fastq_path, cpus=kwargs.get('nthreads'))
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = enzyme if add_site else ''
    for header in fhandler:
        header, seq, qal = get_seq(header)
        counter += 1
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # or not ligation site found, in which case we try with half
            # ligation site in case there was a sequencing error (half ligation
            # site is a RE site or nearly, and thus should not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), '')
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        # continue
        for seq, qal, cnt in  iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq, qal, '0', '-\n'))))
    out.close()
    return out_name, counter
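A minimal usage sketch, not part of the original example: the FASTQ path is a
placeholder and 'HindIII' is assumed to be a valid key of RESTRICTION_ENZYMES
(the signature is the one shown in Example #14 below).

    out_map, n_reads = transform_fastq('sample_r1.fastq.gz', 'sample_r1.map',
                                       r_enz='HindIII', trim=(1, 75),
                                       min_seq_len=20, light_storage=True)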
Example #14
0
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True,
                    light_storage=False, **kwargs):
    """
    Given a FASTQ file, split it into chunks of a given number of reads, trim
    each read according to start/end positions, or split reads into restriction
    enzyme fragments

    :param True add_site: when splitting the sequence at ligation sites found,
       removes the ligation site and puts back the original RE site.

    """
    skip = kwargs.get('skip', False)
    ## define local functions to process reads and sequences
    def _get_fastq_read_heavy(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' quality-header line (not needed)
        qal = fhandler.next()  # keep the quality string
        # header now also contains original read
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_fastq_read_light(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: unlike the heavy variant, the header does not embed the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' quality-header line (not needed)
        qal = fhandler.next()  # keep the quality string
        return (rlines, seq.strip(), qal.strip())

    def _get_map_read_heavy(line):
        header = line.split('\t', 1)[0]
        seq, qal    = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _get_map_read_light(line):
        header, seq, qal, _ = line.split('\t', 3)
        return header, seq, qal

    def _inverse_find(seq, pat):
        try:
            return pat.search(seq).start()
        except AttributeError:
            return 'nan'

    def find_patterns(seq, patterns):
        pos, pat = min((_inverse_find(seq, patterns[p]), p) for p in patterns)
        return int(pos), pat
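        # note (added for clarity): _inverse_find returns the string 'nan' when
        # a pattern is absent; under Python 2 ordering, integers sort before
        # strings, so absent patterns lose the min() above, and when no pattern
        # matches at all int('nan') raises the ValueError handled by the caller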

    def _split_read_re(seq, qal, patterns, site, max_seq_len=None, cnt=0):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed and preceded by the RE site if a
        ligation site was found after the fragment.

        EXAMPLE:

           seq = '-------oGATCo========oGATCGATCo_____________oGATCGATCo~~~~~~~~~~~~'
           qal = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

        should yield these fragments:

            -------oGATCo========oGATC
            xxxxxxxxxxxxxxxxxxxxxxHHHH

            GATCo_____________oGATC
            HHHHxxxxxxxxxxxxxxxHHHH

            GATCo~~~~~~~~~~~~
            HHHHxxxxxxxxxxxxx

        :param seq: sequence of the read fragment
        :param qal: quality of the sequence of the read fragment
        :param patterns: list of patterns of the ligated cut sites
        :param None max_seq_len: to control that all reads are below this
           length
        :param '' site: non-ligated cut site to replace ligation site
        :param 0 cnt: to count number of fragments

        :yields: seq fragments, their qualities and their count, or index
           (higher than 0 if ligation sites are found)
        """
        cnt += 1
        try:
            pos, (r_enz1, r_enz2) = find_patterns(seq, patterns)
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt
            return
        # add dummy quality values ('H') covering the space occupied by the re-added cut-site
        xqal1 = ('H' * len(site[r_enz1]))
        xqal2 = ('H' * len(site[r_enz2]))
        if pos < min_seq_len:
            split_read(site[r_enz2] + seq[pos + len_relgs[(r_enz1, r_enz2)]:],
                       xqal2        + qal[pos + len_relgs[(r_enz1, r_enz2)]:],
                       patterns, no_site, max_seq_len, cnt=cnt)
        else:
            yield seq[:pos] + site[r_enz1], qal[:pos] + xqal1, cnt
        new_pos = pos + len_relgs[(r_enz1, r_enz2)]
        for sseq, sqal, cnt in split_read(site[r_enz2] + seq[new_pos:],
                                          xqal2 + qal[new_pos:], patterns,
                                          site, max_seq_len, cnt=cnt):
            yield sseq, sqal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        r_enzs = [r_enz]
    elif isinstance(r_enz, list):
        r_enzs = r_enz
    else:
        r_enzs = None

    if r_enzs:
        enzymes = {}
        enz_patterns = {}
        for r_enz in r_enzs:
            enzymes[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_patterns = religateds(r_enzs)
        sub_enz_patterns = {}
        len_relgs = {}
        for r_enz1, r_enz2 in enz_patterns:
            sub_enz_patterns[(r_enz1, r_enz2)] = (
                enz_patterns[(r_enz1, r_enz2)][:len(enz_patterns[(r_enz1, r_enz2)])
                                               / 2])
            len_relgs[(r_enz1, r_enz2)] = len(enz_patterns[(r_enz1, r_enz2)])
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                print '    * enzymes: %s & %s, ligation site: %s, RE site: %s & %s' % (
                    r_enz1, r_enz2, enz_patterns[(r_enz1, r_enz2)],
                    enzymes[r_enz1], enzymes[r_enz2])
        # replace pattern with regex to support IUPAC annotation
        for ezp in enz_patterns:
            enz_patterns[ezp] = re.compile(iupac2regex(enz_patterns[ezp]))
        for ezp in sub_enz_patterns:
            sub_enz_patterns[ezp] = iupac2regex(sub_enz_patterns[ezp])
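        # illustrative note (added): IUPAC ambiguity codes are expanded to
        # character classes here, e.g. an 'N' in a ligation-site pattern would
        # typically become '[ACGT]' in the compiled regular expression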
        split_read = _split_read_re
    else:
        enzymes = ''
        enz_patterns = ''
        sub_enz_patterns = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y , 1)

    # function to yield reads from input file
    if light_storage:
        get_seq = _get_fastq_read_light if fastq else _get_map_read_light
        insert_mark = insert_mark_light
    else:
        get_seq = _get_fastq_read_heavy if fastq else _get_map_read_heavy
        insert_mark = insert_mark_heavy

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
    counter = 0
    if skip:
        if fastq:
            print '    ... skipping, only counting lines'
            counter = sum(1 for _ in magic_open(fastq_path,
                                                cpus=kwargs.get('nthreads')))
            counter /= 4 if fastq else 1
            print '            ' + fastq_path, counter, fastq
        return out_fastq, counter
    # open input file
    fhandler = magic_open(fastq_path, cpus=kwargs.get('nthreads'))
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    no_site = dict([(r_enz, '') for r_enz in enzymes])
    site = enzymes if add_site else no_site
    for header in fhandler:
        header, seq, qal = get_seq(header)
        counter += 1
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_patterns, site, len(seq))
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # or not ligation site found, in which case we try with half
            # ligation site in case there was a sequencing error (half ligation
            # site is a RE site or nearly, and thus should not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_patterns, no_site, len(seq))
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        # continue
        for seq, qal, cnt in  iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq, qal, '0', '-\n'))))
    out.close()
    return out_name, counter
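A hedged sketch of the multi-enzyme call supported by this variant (paths and
enzyme names are placeholders; each name must be a key of RESTRICTION_ENZYMES,
and 'nthreads' is forwarded through **kwargs to magic_open):

    out_map, n_reads = transform_fastq('sample_r1.fastq.gz', 'sample_r1.map',
                                       r_enz=['MboI', 'HindIII'], trim=(1, 75),
                                       min_seq_len=20, nthreads=4)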
Example #15
0
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keeps a summary of the results in 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile in tab-separated format containing mapped
       read1 information
    :param out_file2: path to outfile in tab-separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta` containing the genomic
       sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for identification of
       multiple contacts
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = os.path.join(*outfiles[read].split('/')[:-1] +
                                ['tmp_' + outfiles[read].split('/')[-1]])
        tmp_name = ('/' * outfiles[read].startswith('/')) + tmp_name
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'], stdin=PIPE,
                       stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq  = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(pos) + len_seq - 1 # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
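                # frag_piece holds the sorted RE-site positions for this genome
                # chunk; bisect gives the index of the closest downstream site,
                # while idx - 1 points to the closest upstream one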
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        
        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()

        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read = tmp_reads_fh.next()
        prev_head = read.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read
        for read in tmp_reads_fh:
            head = read.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                prev_read =  prev_read.strip() + '|||' + read
            else:
                reads_fh.write(prev_read)
                prev_read = read
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()

        if clean:
            os.system('rm -rf ' + tmp_name)
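A hedged end-to-end sketch combining the parsers (all file paths and the enzyme
name are placeholders; parse_fasta is the genome parser shown in later examples):

    genome_seq = parse_fasta('genome.fa')
    parse_map(['r1_iter1.map', 'r1_iter2.map'],
              ['r2_iter1.map', 'r2_iter2.map'],
              out_file1='reads1.tsv', out_file2='reads2.tsv',
              genome_seq=genome_seq, re_name='HindIII', verbose=True)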
Example #16
0
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True):
    """
    Given a FASTQ file, split it into chunks of a given number of reads, trim
    each read according to start/end positions, or split reads into restriction
    enzyme fragments

    :param True add_site: when splitting the sequence at ligation sites found,
       removes the ligation site and puts back the original RE site.

    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _   = fhandler.next()  # skip the '+' quality-header line (not needed)
        qal = fhandler.next()  # keep the quality string
        # header now also contains original read
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_map_read(line):
        header = line.split('\t', 1)[0]
        seq, qal    = header.rsplit(' ', 2)[-2:]
        return header, seq, qal
        
    def _split_read_re(seq, qal, pattern, max_seq_len=None, site='', cnt=0):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation
        site was found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            cnt += 1
            pos = seq.index(pattern)
            if pos < min_seq_len:
                split_read(seq[pos + len_relg:], qal[pos + len_relg:],
                           pattern, max_seq_len, cnt=cnt)
            else:
                yield seq[:pos] + site, qal[:pos] + ('H' * len(site)), cnt
            for subseq, subqal, cnt in split_read(seq[pos + len_relg:],
                                             qal[pos + len_relg:],
                                             pattern,
                                             max_seq_len, cnt=cnt):
                yield subseq, subqal, cnt
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        print '    * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y , 1)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else _get_map_read

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)
            
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = '' if add_site else enzyme
    for header in fhandler:
        header, seq, qal = get_seq(header)
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # or not ligation site found, in which case we try with half
            # ligation site in case there was a sequencing error (half ligation
            # site is a RE site or nearly, and thus should not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), '')
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        # continue
        for seq, qal, cnt in  iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq + site, qal + 'H' * (len(site)),
                                            '0', '-\n'))))
    out.close()
    return out_name
Example #17
0
def parse_fasta(f_names,
                chr_names=None,
                chr_filter=None,
                chr_regexp=None,
                verbose=True,
                save_cache=True,
                reload_cache=False,
                only_length=False):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass a list of chromosome names, or just one. If None
       is passed, then chromosome names will be inferred from fasta headers
    :param None chr_filter: use only chromosomes in the input list
    :param None chr_regexp: use only chromosomes matching the regular expression
    :param True save_cache: save a cached version of this file for faster
       loading (~4 times faster)
    :param False reload_cache: reload cached genome
    :param False only_length: returns a dictionary with the length of each
       chromosome instead of its sequence

    :returns: a sorted dictionary with chromosome names as keys, and sequences
       as values (sequence in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]

    if len(f_names) == 1:
        fname = f_names[0] + '_genome.TADbit'
    else:
        fname = path.join(path.commonprefix(f_names), 'genome.TADbit')
    if path.exists(fname) and not reload_cache:
        if verbose:
            print 'Loading cached genome'
        genome_seq = OrderedDict()
        for line in open(fname):
            if line.startswith('>'):
                c = line[1:].strip()
            else:
                if only_length:
                    genome_seq[c] = len(line.strip())
                else:
                    genome_seq[c] = line.strip()
        return genome_seq

    if isinstance(chr_names, str):
        chr_names = [chr_names]

    if chr_filter:
        bad_chrom = lambda x: not x in chr_filter
    else:
        bad_chrom = lambda x: False

    if chr_regexp:
        chr_regexp = re.compile(chr_regexp)
    else:
        chr_regexp = re.compile('.*')

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                header = line[1:].split()[0]
                if bad_chrom(header) or not chr_regexp.match(header):
                    header = 'UNWANTED'
                elif not chr_names:
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        if only_length:
            genome_seq[header] = len(seq)
        else:
            genome_seq[header] = ''.join(seq).upper()
        if 'UNWANTED' in genome_seq:
            del (genome_seq['UNWANTED'])
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            if bad_chrom(
                                    header) or not chr_regexp.match(header):
                                header = 'UNWANTED'
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        if bad_chrom(header):
                            header = 'UNWANTED'
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            if only_length:
                genome_seq[header] = sum(len(l.rstrip()) for l in fhandler)
            else:
                genome_seq[header] = ''.join([l.rstrip()
                                              for l in fhandler]).upper()
        if 'UNWANTED' in genome_seq:
            del (genome_seq['UNWANTED'])
    if save_cache and not only_length:
        if verbose:
            print 'saving genome in cache'
        if len(f_names) == 1:
            fname = f_names[0] + '_genome.TADbit'
        else:
            fname = path.join(path.commonprefix(f_names), 'genome.TADbit')
        out = open(fname, 'w')
        for c in genome_seq:
            out.write('>%s\n%s\n' % (c, genome_seq[c]))
        out.close()
    return genome_seq
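A hedged usage sketch for the cached genome parser above (the FASTA path and the
chromosome regular expression are placeholders):

    # parse only canonical-looking chromosomes and cache the result
    genome_seq = parse_fasta('hg38.fa', chr_regexp='^chr[0-9XY]+$',
                             save_cache=True)
    # later calls reload the cache; only_length skips storing the sequences
    chrom_lengths = parse_fasta('hg38.fa', only_length=True)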
Example #18
0
def transform_fastq(fastq_path,
                    out_fastq,
                    trim=None,
                    r_enz=None,
                    add_site=True,
                    min_seq_len=15,
                    fastq=True,
                    verbose=True):
    """
    Given a FASTQ file, split it into chunks of a given number of reads, trim
    each read according to start/end positions, or split reads into restriction
    enzyme fragments

    :param True add_site: when splitting the sequence at ligation sites found,
       removes the ligation site and puts back the original RE site.

    """

    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()  # skip the '+' quality-header line (not needed)
        qal = fhandler.next()  # keep the quality string
        # header now also contains original read
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(), seq.strip(),
                qal.strip())

    def _get_map_read(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site=''):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation
        site was found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            pos = seq.index(pattern)
            if pos < min_seq_len:
                split_read(seq[pos + len_relg:], qal[pos + len_relg:], pattern,
                           max_seq_len)
            else:
                yield seq[:pos] + site, qal[:pos] + ('H' * len(site))
            for subseq, subqal in split_read(seq[pos + len_relg:],
                                             qal[pos + len_relg:], pattern,
                                             max_seq_len):
                yield subseq, subqal
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        print '    * enzyme: %s, ligation site: %s, RE site: %s' % (
            r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else _get_map_read

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)

    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = '' if add_site else enzyme
    for header in fhandler:
        header, seq, qal = get_seq(header)
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # or not ligation site found, in which case we try with half
            # ligation site in case there was a sequencing error (half ligation
            # site is a RE site or nearly, and thus should not be found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), site)
            try:
                seq, qal = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((header, seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        # continue
        for seq, qal in iter_frags:
            out.write(
                _map2fastq('\t'.join((header, seq + site,
                                      qal + 'H' * (len(site)), '0', '-\n'))))
    out.close()
    return out_name
Example #19
0
def parse_fasta(f_names, chr_names=None, chr_filter=None, chr_regexp=None,
                verbose=True, save_cache=True, reload_cache=False, only_length=False):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass a list of chromosome names, or just one. If None
       is passed, then chromosome names will be inferred from fasta headers
    :param None chr_filter: use only chromosomes in the input list
    :param None chr_regexp: use only chromosomes matching the regular expression
    :param True save_cache: save a cached version of this file for faster
       loading (~4 times faster)
    :param False reload_cache: reload cached genome
    :param False only_length: returns a dictionary with the length of each
       chromosome instead of its sequence

    :returns: a sorted dictionary with chromosome names as keys, and sequences
       as values (sequence in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]

    if len(f_names) == 1:
        fname = f_names[0] + '_genome.TADbit'
    else:
        fname = path.join(path.commonprefix(f_names), 'genome.TADbit')
    if path.exists(fname) and not reload_cache:
        if verbose:
            print 'Loading cached genome'
        genome_seq = OrderedDict()
        for line in open(fname):
            if line.startswith('>'):
                c = line[1:].strip()
            else:
                if only_length:
                    genome_seq[c] = len(line.strip())
                else:
                    genome_seq[c] = line.strip()
        return genome_seq

    if isinstance(chr_names, str):
        chr_names = [chr_names]

    if chr_filter:
        bad_chrom = lambda x: not x in chr_filter
    else:
        bad_chrom = lambda x: False

    if chr_regexp:
        chr_regexp = re.compile(chr_regexp)
    else:
        chr_regexp = re.compile('.*')

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                header = line[1:].split()[0]
                if bad_chrom(header) or not chr_regexp.match(header):
                    header = 'UNWANTED'
                elif not chr_names:
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        if only_length:
            genome_seq[header] = len(seq)
        else:
            genome_seq[header] = ''.join(seq).upper()
        if 'UNWANTED' in genome_seq:
            del(genome_seq['UNWANTED'])
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            if bad_chrom(header) or not chr_regexp.match(header):
                                header = 'UNWANTED'
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        if bad_chrom(header):
                            header = 'UNWANTED'
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            if only_length:
                genome_seq[header] = sum(len(l.rstrip()) for l in fhandler)
            else:
                genome_seq[header] = ''.join([l.rstrip() for l in fhandler]).upper()
        if 'UNWANTED' in genome_seq:
            del(genome_seq['UNWANTED'])
    if save_cache and not only_length:
        if verbose:
            print 'saving genome in cache'
        if len(f_names) == 1:
            fname = f_names[0] + '_genome.TADbit'
        else:
            fname = path.join(path.commonprefix(f_names), 'genome.TADbit')
        out = open(fname, 'w')
        for c in genome_seq:
            out.write('>%s\n%s\n' % (c, genome_seq[c]))
        out.close()
    return genome_seq
Example #20
0
def save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               nbad_columns, ncolumns, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try: # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MERGE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                JOBid int,
                unique (PATHid))""")
            cur.execute("""
            create table MERGE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Wrkd1Path int,
                Wrkd2Path int,
                Bed1Path int,
                Bed2Path int,
                MergePath int,
                unique (JOBid))""")
            cur.execute("""
            create table MERGE_STATs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                decay_corr text,
                eigen_corr text,
                reprod real,
                scc real,
                std_scc real,
                N_columns int,
                N_filtered int,
                Resolution int,
                bias1Path int,
                bias2Path int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type   , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Merge',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)
        add_path(cur, decay_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, decay_corr_fig, 'FIGURE'    , jobid, opts.workdir)
        add_path(cur, eigen_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, eigen_corr_fig, 'FIGURE'    , jobid, opts.workdir)

        add_path(cur, opts.workdir , 'WORKDIR'    , jobid)
        add_path(cur, opts.workdir1, 'WORKDIR1'   , jobid, opts.workdir)
        add_path(cur, opts.workdir2, 'WORKDIR2'   , jobid, opts.workdir)
        add_path(cur, mreads1      , 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, mreads2      , 'EXT_HIC_BAM', jobid, opts.workdir)
        if not opts.skip_merge:
            add_path(cur, outbed   , 'HIC_BAM'    , jobid, opts.workdir)

        if opts.norm:
            add_path(cur, biases1      , 'BIASES'     , jobid, opts.workdir)
            add_path(cur, biases2      , 'BIASES'     , jobid, opts.workdir)

            biasid1 = get_path_id(cur, biases1, opts.workdir)
            biasid2 = get_path_id(cur, biases2, opts.workdir)
        else:
            biasid1 = 0
            biasid2 = 0

        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads1, opts.workdir)))
        bed1 = cur.fetchall()[0][0]
        if opts.workdir1:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir1, opts.workdir)))
            w1path = cur.fetchall()[0][0]
        else:
            w1path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads2, opts.workdir)))
        bed2 = cur.fetchall()[0][0]
        if opts.workdir2:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir2, opts.workdir)))
            w2path = cur.fetchall()[0][0]
        else:
            w2path = 0
        if not opts.skip_merge:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(outbed, opts.workdir)))
            outbedid = cur.fetchall()[0][0]
        if not opts.skip_comparison:
            decay_corr = '-'.join(['%.1f' % (v)
                                   for v in corr[:10:2]]).replace('0.', '.')
            eigen_corr = '-'.join(['%.2f' % (max(v))
                                   for v in eig_corr[:4]]).replace('0.', '.')
        else:
            decay_corr = eigen_corr = None
        if not opts.skip_merge:
            cur.execute("""
            insert into MERGE_OUTPUTs
            (Id  , JOBid, Wrkd1Path, Wrkd2Path, Bed1Path, Bed2Path, MergePath)
            values
            (NULL,    %d,        %d,        %d,       %d,       %d,        %d)
            """ % (jobid,    w1path,    w2path,     bed1,     bed2,  outbedid))

        if not opts.skip_comparison:
            cur.execute("""
            insert into MERGE_STATs
            (Id  , JOBid, N_columns,   N_filtered, Resolution, reprod, scc, std_scc, decay_corr, eigen_corr, bias1Path, bias2Path)
            values
            (NULL,    %d,        %d,           %d,         %d,     %f,  %f,      %f,       '%s',       '%s',        %d,        %d)
            """ % (jobid,  ncolumns, nbad_columns, opts.reso , reprod, scc,     std, decay_corr, eigen_corr,   biasid1,   biasid2))

        if opts.workdir1:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile1 = opts.tmpdb1
                try: # to copy in case read1 was already mapped for example
                    copyfile(path.join(opts.workdir1, 'trace.db'), dbfile1)
                except IOError:
                    pass
            else:
                dbfile1 = path.join(opts.workdir1, 'trace.db')
            tmpcon = lite.connect(dbfile1)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked1[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile1)
        if opts.workdir2:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile2 = opts.tmpdb2
                try: # to copy in case read2 was already mapped for example
                    copyfile(path.join(opts.workdir2, 'trace.db'), dbfile2)
                except IOError:
                    pass
            else:
                dbfile2 = path.join(opts.workdir2, 'trace.db')
            tmpcon = lite.connect(dbfile2)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked2[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile2)

        for f in masked1:
            if f  != 'valid-pairs':
                outmask = path.join(opts.workdir, '03_filtered_reads',
                                    'all_r1-r2_intersection_%s.tsv_%s.tsv' % (
                                        param_hash, f))
                out = open(outmask, 'w')
                try:
                    fh = magic_open(path.join(opts.workdir1, masked1[f]['path']))
                except FileNotFoundError:
                    fh = magic_open(path.join(opts.workdir1, masked1[f]['path'] + '.gz'))
                for line in fh:
                    out.write(line)
                try:
                    fh = magic_open(path.join(opts.workdir2, masked2[f]['path']))
                except FileNotFoundError:
                    fh = magic_open(path.join(opts.workdir2, masked2[f]['path'] + '.gz'))
                for line in fh:
                    out.write(line)
                add_path(cur, outmask, 'FILTER', jobid, opts.workdir)
            else:
                if opts.skip_merge:
                    outmask = 'NA'
                else:
                    outmask = outbed
            try:
                path_id = get_path_id(cur, outmask, opts.workdir)
            except IndexError:
                path_id = -1
            cur.execute("""
            insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, JOBid)
            values
            (NULL,     %d, '%s',  '%s',    %d)
            """ % (path_id, f, masked1[f]['count'] + masked2[f]['count'], jobid))

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'MERGE_OUTPUTs')
        print_db(cur, 'MERGE_STATs')
        print_db(cur, 'FILTER_OUTPUTs')

    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
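A small sketch of the advisory file-lock pattern used at the top and bottom of
save_to_db above (illustrative only; 'workdir' is a placeholder):

    lock = path.join(workdir, '__lock_db')
    while path.exists(lock):    # wait while another process owns the lock
        time.sleep(0.5)
    open(lock, 'a').close()     # acquire the lock by touching the file
    try:
        pass                    # ... read or write trace.db here ...
    finally:
        try:
            remove(lock)        # always release the lock
        except OSError:
            pass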
Example #21
0
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    min_seq_len=20, fastq=True, verbose=True):
    """
    Given a FASTQ file, split it into chunks of a given number of reads, trim
    each read according to start/end positions, or split reads into restriction
    enzyme fragments
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        """
        rlines = rlines.rstrip('\n')
        line = fhandler.next()
        _ = fhandler.next()  # skip the '+' quality-header line (not needed)
        _ = fhandler.next()  # skip the quality string (not needed here)
        return rlines, line.strip()

    def _split_read_re(x, max_seq_len=None):
        """
        Recursive generator that splits reads according to the
        predefined restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation
        site was found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            pos = x.index(enz_pattern)
            if pos < min_seq_len:
                split_read(x[pos + len_relg:], max_seq_len)
            else:
                yield x[:pos] + enzyme
            for x in split_read(x[pos + len_relg:], max_seq_len):
                yield x
        except ValueError:
            if len(x) > min_seq_len:
                if len(x) == max_seq_len:
                    raise StopIteration
                yield x

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        len_relg = len(enz_pattern)
        print '  - splitting into restriction enzyme (RE) fragments using ligation sites'
        print '  - ligation sites are replaced by RE sites to match the reference genome'
        print '    * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        split_read = lambda x, y: (yield x)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else lambda x: x.split('\t', 2)[:2]

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print '  - conversion to MAP format'
        if trim:
            print '  - trimming reads %d-%d' % tuple(trim)


    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    for header in fhandler:
        header, line = get_seq(header)
        # trim on wanted region of the read
        line = strip_line(line)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(line, len(line))
        # the first fragment should not be preceded by the RE site
        try:
            frag = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        out.write('\t'.join((header, frag, 'H' * len(frag), '0', '-\n')))
        # the next fragments should be preceded by the RE site
        for frag in  iter_frags:
            out.write('\t'.join((header, frag + enzyme,
                                 'H' * (len(frag) + len(enzyme)), '0', '-\n')))
    out.close()
    return out_name
Example #22
0
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keeps a summary of the results in 2 tab-separated files that will contain 7
    columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
    sequence length, position of the closest upstream RE site, position of
    the closest downstream RE site.

    The position of reads mapped on reverse strand will be computed from the end of
    the read (original position + read length - 1)

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta` containing the genomic
       sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for identification of
       multiple contacts
    :param False compress: compress (gzip) input map files. This is done in the
       background while the next MAP files are parsed, or while files are sorted.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000
    
    windows = {}
    multis  = {}
    procs   = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            if verbose:
                print 'loading file: %s' % (fnam)
            # start parsing
            read_count = 0
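            # read the file in batches of max_size reads; each batch is
            # flushed to its own sorted temporary file so the pieces can be
            # merge-sorted afterwards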
            try:
                while True:
                    for _ in xrange(max_size):
                        try:
                            reads.append(read_read(fhandler.next(), frags,
                                                   frag_chunk))
                        except KeyError:
                            # Chromosome not in hash
                            continue
                        read_count += 1
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            except StopIteration:
                fhandler.close()
                nfile += 1
                write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            windows[read][num] = read_count
            if kwargs.get('compress', False) and fnam.endswith('.map'):
                print 'compressing input MAP file'
                procs.append(Popen(['gzip', fnam]))
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)

        # we now have sorted temporary files
        # we do a merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped reads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read_line
        multis[read] = 0
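        # reads whose ID (the part before the first '~') matches the previous
        # line belong to the same original read (multiple contacts): they are
        # concatenated on a single line, separated by '|||'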
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
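# Minimal usage sketch for parse_map (the file names, enzyme and output paths
# below are invented for illustration; parse_fasta and parse_map refer to the
# functions shown in this document):
genome = parse_fasta('genome.fasta')
windows, multis = parse_map(['reads1_iter1.map', 'reads1_iter2.map'],
                            ['reads2_iter1.map', 'reads2_iter2.map'],
                            out_file1='reads1_parsed.tsv',
                            out_file2='reads2_parsed.tsv',
                            genome_seq=genome, re_name='HindIII',
                            verbose=True)
# windows[read][iteration] holds the number of reads parsed per mapping
# iteration; multis[read] counts reads whose fragments were merged onto a
# single output line (multiple contacts, separated by '|||').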
Example #23
0
def parse_fasta(f_names, chr_names=None, chr_filter=None, chr_regexp=None,
                verbose=True):
    """
    Parse a list of fasta files, or just one fasta.

    WARNING: The order is important

    :param f_names: list of paths to files, or just a single path
    :param None chr_names: pass list of chromosome names, or just one. If None
       are passed, then chromosome names will be inferred from fasta headers
    :param None chr_filter: use only chromosomes in the input list
    :param None chr_regexp: use only chromosomes whose names match this
       regular expression

    :returns: a sorted dictionary with chromosome names as keys, and sequences
       as values (sequence in upper case)
    """
    if isinstance(f_names, str):
        f_names = [f_names]
    if isinstance(chr_names, str):
        chr_names = [chr_names]

    if chr_filter:
        bad_chrom = lambda x: x not in chr_filter
    else:
        bad_chrom = lambda x: False

    if chr_regexp:
        chr_regexp = re.compile(chr_regexp)
    else:
        chr_regexp = re.compile('.*')

    genome_seq = OrderedDict()
    if len(f_names) == 1:
        header = None
        seq = []
        for line in magic_open(f_names[0]):
            if line.startswith('>'):
                if header:
                    genome_seq[header] = ''.join(seq).upper()
                header = line[1:].split()[0]
                if bad_chrom(header) or not chr_regexp.match(header):
                    header = 'UNWANTED'
                elif not chr_names:
                    if verbose:
                        print 'Parsing %s' % (header)
                else:
                    header = chr_names.pop(0)
                    if verbose:
                        print 'Parsing %s as %s' % (line[1:].rstrip(), header)
                seq = []
                continue
            seq.append(line.rstrip())
        genome_seq[header] = ''.join(seq).upper()
        if 'UNWANTED' in genome_seq:
            del genome_seq['UNWANTED']
    else:
        for fnam in f_names:
            fhandler = magic_open(fnam)
            try:
                while True:
                    if not chr_names:
                        header = fhandler.next()
                        if header.startswith('>'):
                            header = header[1:].split()[0]
                            if bad_chrom(header) or not chr_regexp.match(header):
                                header = 'UNWANTED'
                            genome_seq[header] = ''
                            break
                    else:
                        _ = fhandler.next()
                        header = chr_names.pop(0)
                        if bad_chrom(header):
                            header = 'UNWANTED'
                        genome_seq[header] = ''
                        break
            except StopIteration:
                raise Exception('No crocodiles found, is it fasta?')
            genome_seq[header] = ''.join([l.rstrip() for l in fhandler]).upper()
        if 'UNWANTED' in genome_seq:
            del genome_seq['UNWANTED']
    return genome_seq
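# Minimal usage sketch of the filtering options (file names, chromosome names
# and the regular expression below are invented for illustration, and the
# multi-file case assumes each file's fasta header matches its name):
genome = parse_fasta('genome.fasta', chr_regexp='chr[0-9]+$')
# keeps only sequences whose header matches the regular expression
genome = parse_fasta(['chr1.fa', 'chr2.fa'], chr_filter=['chr1'])
# sequences whose header is not listed in chr_filter are parsed but dropped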