Exemplo n.º 1
0
 def test_17_map_re_sites(self):
     """
     Parse the reference FASTA file and map restriction-enzyme sites on it.

     Checks the length of the 'chr4' sequence and then, for the 'dpnIi' and
     'hindiii' enzyme names, the number of chunks stored for 'chr2L', the
     number of entries in chunk 230 of 'chr2L' and one entry of 'chr4'.
     """
     genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                          verbose=False)
     self.assertEqual(len(genome['chr4']), 1351857)

     # first enzyme name
     dpn_sites = map_re_sites('dpnIi', genome)
     dpn_chr2l = dpn_sites['chr2L']
     self.assertEqual(len(dpn_chr2l), 231)
     self.assertEqual(len(dpn_chr2l[230]), 16)
     self.assertEqual(dpn_sites['chr4'][10][50], 1018069)

     # second enzyme name
     hind_sites = map_re_sites('hindiii', genome)
     hind_chr2l = hind_sites['chr2L']
     self.assertEqual(len(hind_chr2l), 231)
     self.assertEqual(len(hind_chr2l[230]), 3)
     self.assertEqual(hind_sites['chr4'][10][5], 1017223)
Exemplo n.º 2
0
 def test_17_map_re_sites(self):
     """
     test fasta parsing and mapping re sites
     """
     if ONLY and ONLY != "17":
         return
     if CHKTIME:
         t0 = time()
     ref_genome = parse_fasta(PATH + "/ref_genome/chr2L_chr4_dm3.bz2", verbose=False)
     self.assertEqual(len(ref_genome["chr4"]), 1351857)
     frags = map_re_sites("dpnIi", ref_genome)
     self.assertEqual(len(frags["chr2L"]), 231)
     self.assertEqual(len(frags["chr2L"][230]), 16)
     self.assertEqual(frags["chr4"][10][50], 1018069)
     frags = map_re_sites("hindiii", ref_genome)
     self.assertEqual(len(frags["chr2L"]), 231)
     self.assertEqual(len(frags["chr2L"][230]), 3)
     self.assertEqual(frags["chr4"][10][5], 1017223)
     if CHKTIME:
         self.assertEqual(True, True)
         print "17", time() - t0
Exemplo n.º 3
0
 def test_17_map_re_sites(self):
     """
     test fasta parsing and mapping re sites
     """
     if ONLY and "17" not in ONLY:
         return
     if CHKTIME:
         t0 = time()
     ref_genome = parse_fasta(PATH + "/ref_genome/chr2L_chr4_dm3.bz2",
                              verbose=False)
     self.assertEqual(len(ref_genome["chr4"]), 1351857)
     frags = map_re_sites("dpnIi", ref_genome)
     self.assertEqual(len(frags["chr2L"]), 231)
     self.assertEqual(len(frags["chr2L"][230]), 16)
     self.assertEqual(frags["chr4"][10][50], 1018069)
     frags = map_re_sites("hindiii", ref_genome)
     self.assertEqual(len(frags["chr2L"]), 231)
     self.assertEqual(len(frags["chr2L"][230]), 3)
     self.assertEqual(frags["chr4"][10][5], 1017223)
     if CHKTIME:
         self.assertEqual(True, True)
         print "17", time() - t0
Exemplo n.º 4
0
 def test_17_map_re_sites(self):
     """
     test fasta parsing and mapping re sites
     """
     if ONLY and ONLY != '17':
         return
     if CHKTIME:
         t0 = time()
     ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                              verbose=False)
     self.assertEqual(len(ref_genome['chr4']), 1351857)
     frags = map_re_sites('dpnIi', ref_genome)
     self.assertEqual(len(frags['chr2L']), 231)
     self.assertEqual(len(frags['chr2L'][230]), 16)
     self.assertEqual(frags['chr4'][10][50], 1018069)
     frags = map_re_sites('hindiii', ref_genome)
     self.assertEqual(len(frags['chr2L']), 231)
     self.assertEqual(len(frags['chr2L'][230]), 3)
     self.assertEqual(frags['chr4'][10][5], 1017223)
     if CHKTIME:
         self.assertEqual(True, True)
         print '17', time() - t0
Exemplo n.º 5
0
 def test_17_map_re_sites(self):
     """
     test fasta parsing and mapping re sites
     """
     if ONLY and ONLY != '17':
         return
     if CHKTIME:
         t0 = time()
     ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                              verbose=False)
     self.assertEqual(len(ref_genome['chr4']), 1351857)
     frags = map_re_sites('dpnIi', ref_genome)
     self.assertEqual(len(frags['chr2L']), 231)
     self.assertEqual(len(frags['chr2L'][230]), 16)
     self.assertEqual(frags['chr4'][10][50], 1018069)
     frags = map_re_sites('hindiii', ref_genome)
     self.assertEqual(len(frags['chr2L']), 231)
     self.assertEqual(len(frags['chr2L'][230]), 3)
     self.assertEqual(frags['chr4'][10][5], 1017223)
     if CHKTIME:
         self.assertEqual(True, True)
         print '17', time() - t0
Exemplo n.º 6
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 6
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence lebgth, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file1: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis  = {}
    procs   = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print 'WARNING: file "%s" not found' % fnam
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower()=='gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print 'loading SAM file from %s: %s' % (mapper, fnam)
            # getrname chromosome names
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)


        # we have now sorted temporary files
        # we do merge sort for eah pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read =  prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
Exemplo n.º 7
0
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results into 2 tab-separated files that will contain 6
    columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
    sequence lebgth, position of the closest upstream RE site, position of
    the closest downstream RE site.

    The position of reads mapped on reverse strand will be computed from the end of
    the read (original position + read length - 1)

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for indentification of
       multiple-contacts
    :param False compress: compress (gzip) input map files. This is done in the
       background while next MAP files are parsed, or while files are sorted.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000
    
    windows = {}
    multis  = {}
    procs   = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            if verbose:
                print 'loading file: %s' % (fnam)
            # start parsing
            read_count = 0
            try:
                while not False:
                    for _ in xrange(max_size):
                        try:
                            reads.append(read_read(fhandler.next(), frags,
                                                   frag_chunk))
                        except KeyError:
                            # Chromosome not in hash
                            continue
                        read_count += 1
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            except StopIteration:
                fhandler.close()
                nfile += 1
                write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            windows[read][num] = read_count
            if kwargs.get('compress', False) and fnam.endswith('.map'):
                print 'compressing input MAP file'
                procs.append(Popen(['gzip', fnam]))
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)

        # we have now sorted temporary files
        # we do merge sort for eah pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read =  prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
Exemplo n.º 8
0
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results into 2 tab-separated files that will contain 6
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence lebgth, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file1: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for indentification of
       multiple-contacts
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = os.path.join(*outfiles[read].split('/')[:-1] +
                                ['tmp_' + outfiles[read].split('/')[-1]])
        tmp_name = ('/' * outfiles[read].startswith('/')) + tmp_name
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'], stdin=PIPE,
                       stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq  = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(pos) + len_seq - 1 # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        
        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()

        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read = tmp_reads_fh.next()
        prev_head = read.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read
        for read in tmp_reads_fh:
            head = read.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                prev_read =  prev_read.strip() + '|||' + read
            else:
                reads_fh.write(prev_read)
                prev_read = read
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()

        if clean:
            os.system('rm -rf ' + tmp_name)
Exemplo n.º 9
0
def parse_sam(f_names1, f_names2, frags, out_file1, out_file2, genome_seq,
              re_name, verbose=False, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 6
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence lebgth, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param frags: a dictionary generated by :func:`pyatdbit.mapping.restriction_enzymes.map_re_sites`.

    """
    frags = map_re_sites(re_name, genome_seq, verbose=True)
    frag_chunk = kwargs.get('frag_chunk', 100000)

    fnames = f_names1, f_names2
    outfiles = out_file1, out_file2
    for read in range(2):
        if verbose:
            print 'Loading read' + str(read + 1)
        reads    = []
        for fnam in fnames[read]:
            if verbose:
                print 'loading file:', fnam
            try:
                fhandler = Samfile(fnam)
            except IOError:
                continue
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i).replace('chr', '')
                    i += 1
                except ValueError:
                    break
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if r.tags[1][1] != 1:
                    continue
                positive   = not r.is_reverse
                crm        = crm_dict[r.tid]
                len_seq    = len(r.seq)
                pos        = r.pos + (0 if positive else len_seq)
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx        = bisect(frag_piece, pos)
                prev_re    = frag_piece[idx - 1]
                next_re    = frag_piece[idx]
                name       = r.qname

                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
        reads_fh = open(outfiles[read], 'w')
        reads_fh.write(''.join(sorted(reads)))
        reads_fh.close()
    del(reads)
Exemplo n.º 10
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site. Consecutive lines of the sorted
       intermediate file sharing the same read ID (the part before the first
       '~') are joined on a single line with '|||'.

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information (required if, and only if, f_names2 is given)
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param True clean: remove the last intermediate file once the final output
       file is written
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (keyword argument) chunk size forwarded to
       map_re_sites and used to index its result

    :returns: a tuple of two dictionaries keyed by read index (0 or 1):
       the number of reads kept per mapping iteration, and the number of
       lines merged into a previous line because they shared its read ID

    :raises Exception: if a required argument is missing, if an input file is
       not a SAM/BAM file, or if a read maps mostly outside its chromosome
    :raises IndexError: if no intermediate file was produced (e.g. none of the
       input files could be opened)
    :raises StopIteration: if the intermediate file is empty
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, basestring):
        f_names1 = [f_names1]
    if isinstance(f_names2, basestring):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis  = {}
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping from the file
            # extension; when it is not a number just count the files
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used (only once, the guess is kept for next files)
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            # reads for which `condition(r.tags)` is true are skipped
            if mapper.lower()=='gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading SAM file from %s: %s' % (mapper, fnam))
            # map reference index -> chromosome name; getrname raises
            # ValueError once the index is past the last reference
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome:
                    # walk back one position at a time, at most len_seq times
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                # idx == 0 means no site before pos in this chunk: use the
                # first one instead of wrapping around to the last
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    # NOTE(review): presumably the helper sorts, dumps and
                    # empties `reads` and registers the file in `tmp_files`
                    # -- confirm against write_reads_to_file
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            fhandler.close()
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)

        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            file2 = tmp_files.pop(0)
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        if not tmp_files:
            raise IndexError('ERROR: no intermediate file written for read%d, '
                             'check that input files exist\n' % (read + 1))
        tmp_name = tmp_files[0]

        if verbose:
            print('Getting Multiple contacts')
        with open(outfiles[read], 'w') as reads_fh:
            ## Also pipe file header
            # chromosome sizes (in order)
            reads_fh.write('# Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('# Mapped\treads count by iteration\n')
            for size in windows[read]:
                reads_fh.write('# MAPPED %d %d\n' % (size,
                                                     windows[read][size]))

            ## Multicontacts
            with open(tmp_name) as tmp_reads_fh:
                try:
                    read_line = next(tmp_reads_fh)
                except StopIteration:
                    raise StopIteration('ERROR!\n Nothing parsed, check input '
                                        'files and chromosome names (in '
                                        'genome.fasta and SAM/MAP files).')
                # read ID without what follows the first '~'
                prev_head = read_line.split('\t', 1)[0]
                prev_head = prev_head.split('~' , 1)[0]
                prev_read = read_line
                multis[read] = 0
                for read_line in tmp_reads_fh:
                    head = read_line.split('\t', 1)[0]
                    head = head.split('~' , 1)[0]
                    if head == prev_head:
                        multis[read] += 1
                        prev_read = prev_read.strip() + '|||' + read_line
                    else:
                        reads_fh.write(prev_read)
                        prev_read = read_line
                    prev_head = head
                reads_fh.write(prev_read)
        if clean:
            # tmp_name was just opened as a regular file: no shell needed
            try:
                os.remove(tmp_name)
            except OSError:
                warn('WARNING: could not remove temporary file %s\n' % tmp_name)
    return windows, multis
Exemplo n.º 11
0
def parse_map(f_names1,
              f_names2=None,
              out_file1=None,
              out_file2=None,
              genome_seq=None,
              re_name=None,
              verbose=False,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site. Lines are sorted by read ID with the
       external `sort` command and consecutive lines sharing the same read ID
       are joined on a single line with '|||'.

    Each input line is expected to hold at least 5 tab-separated fields: the
    first is the read ID, the second the sequence and the fifth the alignment
    as 'chromosome:strand:position[:...]'.

    :param f_names1: a list of path to map files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to map files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information (required if, and only if, f_names2 is given)
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param 100000 frag_chunk: (keyword argument) chunk size forwarded to
       map_re_sites and used to index its result

    :raises Exception: if a required argument is missing or if a read maps
       mostly outside its chromosome
    :raises StopIteration: if no read at all was kept for one of the outputs
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name,
                         genome_seq,
                         frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows = {}
        # temporary file 'tmp_<name>' next to the output file; rpartition
        # keeps a bare file name in the current directory instead of
        # sending it to the filesystem root
        out_dir, sep, out_base = outfiles[read].rpartition('/')
        tmp_name = out_dir + sep + 'tmp_' + out_base
        tmp_reads_fh = open(tmp_name, 'w')
        # stable sort on the first (tab-separated) column: the read ID
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
                       stdin=PIPE,
                       stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping from the file
            # extension; when it is not a number just count the files
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print('loading file: %s' % (fnam))
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    # remove 1 because all inclusive
                    pos = int(pos) + len_seq - 1
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome:
                    # walk back one position at a time, at most len_seq times
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                # idx == 0 means no site before pos in this chunk: use the
                # first one instead of wrapping around to the last
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write(
                    '%s\t%s\t%d\t%d\t%d\t%d\t%d\n' %
                    (name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1

        if verbose:
            print('finishing to sort')
        # closes the sorter's stdin and waits for it to finish writing
        sorter.communicate()
        tmp_reads_fh.close()

        if verbose:
            print('Getting Multiple contacts')
        with open(outfiles[read], 'w') as reads_fh:
            ## Also pipe file header
            # chromosome sizes (in order)
            reads_fh.write('# Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('# Mapped\treads count by iteration\n')
            for size in windows:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))

            ## Multicontacts
            with open(tmp_name) as sorted_fh:
                try:
                    line = next(sorted_fh)
                except StopIteration:
                    raise StopIteration('ERROR!\n Nothing parsed, check input '
                                        'files and chromosome names (in '
                                        'genome.fasta and MAP files).')
                prev_head = line.split('\t', 1)[0]
                prev_read = line.strip()
                for line in sorted_fh:
                    head = line.split('\t', 1)[0]
                    if head == prev_head:
                        prev_read += '|||' + line.strip()
                    else:
                        reads_fh.write(prev_read + '\n')
                        prev_read = line.strip()
                    prev_head = head
                reads_fh.write(prev_read + '\n')
Exemplo n.º 12
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site. All kept reads are held in memory and
       written sorted after a header listing chromosome lengths.

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information (required if, and only if, f_names2 is given)
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (keyword argument) chunk size forwarded to
       map_re_sites and used to index its result

    :raises Exception: if a required argument is missing or if a read maps
       mostly outside its chromosome
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        reads    = []
        for fnam in fnames[read]:
            if verbose:
                print('loading file: %s' % fnam)
            try:
                fhandler = Samfile(fnam)
            except IOError:
                # keep going with the other files, but say so
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # guess mapper used (only once, the guess is kept for next files)
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            # reads for which `condition(r.tags)` is true are skipped
            if mapper.lower()=='gem':
                condition = lambda x: x[1][1] != 1
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('MAPPER: %s' % mapper)
            # map reference index -> chromosome name; getrname raises
            # ValueError once the index is past the last reference
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                pos      = r.pos + (0 if positive else len_seq)
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx        = bisect(frag_piece, pos)
                try:
                    next_re    = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome:
                    # walk back one position at a time, at most len_seq times
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx        = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                # idx == 0 means no site before pos in this chunk: use the
                # first one, a plain `idx - 1` would wrap around to the last
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname

                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
            fhandler.close()
        with open(outfiles[read], 'w') as reads_fh:
            ## write file header
            # chromosome sizes (in order)
            reads_fh.write('## Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write(''.join(sorted(reads)))
    del reads
Exemplo n.º 13
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site. All kept reads are held in memory and
       written sorted after a header listing chromosome lengths and the number
       of reads kept per mapping iteration.

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information (required if, and only if, f_names2 is given)
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (keyword argument) chunk size forwarded to
       map_re_sites and used to index its result

    :raises Exception: if a required argument is missing, if an input file is
       not a SAM/BAM file, or if a read maps mostly outside its chromosome
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows = {}
        reads    = []
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping from the file
            # extension; when it is not a number just count the files
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            windows.setdefault(num, 0)
            # guess mapper used (only once, the guess is kept for next files)
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            # reads for which `condition(r.tags)` is true are skipped
            if mapper.lower()=='gem':
                condition = lambda x: x[1][1] != 1
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading %s file: %s' % (mapper, fnam))
            # map reference index -> chromosome name; getrname raises
            # ValueError once the index is past the last reference
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    # NOTE(review): with a 0-based r.pos this is one past the
                    # last base of the read (the other variant of this
                    # function uses r.pos + len_seq) -- confirm intent
                    pos = r.pos + len_seq + 1
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome:
                    # walk back one position at a time, at most len_seq times
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                # idx == 0 means no site before pos in this chunk: use the
                # first one instead of wrapping around to the last
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
            fhandler.close()
        with open(outfiles[read], 'w') as reads_fh:
            ## write file header
            # chromosome sizes (in order)
            reads_fh.write('## Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('## Number of mapped reads by iteration\n')
            for size in windows:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
            reads_fh.write(''.join(sorted(reads)))
    del reads
Exemplo n.º 14
0
def parse_sam(f_names1,
              f_names2=None,
              out_file1=None,
              out_file2=None,
              genome_seq=None,
              re_name=None,
              verbose=False,
              mapper=None,
              ncpus=1,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    Each input file is handed to `_read_one_sam` in a pool of worker
    processes; the per-file '<input file>.tsv' results are then sorted
    together by read ID with the external `sort` command, appended to the
    output file after its header, and removed.

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information (required if, and only if, f_names2 is given)
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 1 ncpus: number of worker processes in the pool
    :param 100000 frag_chunk: (keyword argument) chunk size forwarded to
       map_re_sites and to the workers

    :raises Exception: if a required argument is missing or if sorting fails
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name,
                         genome_seq,
                         frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )

    sorting = []  # (sort process, output file handle) pairs
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        pool = mu.Pool(ncpus)
        jobs = []
        num = 0
        for fnam in fnames[read]:
            num += 1
            jobs.append(
                pool.apply_async(_read_one_sam,
                                 args=(fnam, mapper, verbose, frags,
                                       frag_chunk, num)))
        pool.close()
        pool.join()
        # sum the per-file counters returned by the workers
        windows = {}
        for job in jobs:
            counts = job.get()
            for k in counts:
                windows[k] = windows.get(k, 0) + counts[k]

        with open(outfiles[read], 'w') as reads_fh:
            ## write file header
            # chromosome sizes (in order)
            reads_fh.write('## Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('## Number of mapped reads by iteration\n')
            for size in windows:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))

        # write the rest of the file with `sort` to concatenate and sort,
        # done asynchronously. File names go in an argument list (no shell),
        # so paths with spaces or shell metacharacters are safe.
        tsv_files = [fnam + '.tsv' for fnam in fnames[read]]
        out_fh = open(outfiles[read], 'a')
        try:
            proc = Popen(['sort', '-k1,1'] + tsv_files, stdout=out_fh)
        except OSError:
            out_fh.close()
            raise Exception('ERROR: problem sorting file\n')
        sorting.append((proc, out_fh))
    if verbose:
        print('Sorting reads')
    failed = False
    for proc, out_fh in sorting:
        # any non-zero status is a failure (negative: killed by a signal)
        if proc.wait() != 0:
            failed = True
        out_fh.close()
    if failed:
        raise Exception('ERROR: problem sorting file\n')
    if verbose:
        print('Removing temporary files...')
    for read in range(len(fnames)):
        for fnam in fnames[read]:
            try:
                os.remove(fnam + '.tsv')
            except OSError:
                # same as `rm -f`: a missing file is not an error
                pass
Exemplo n.º 15
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              ncpus=1, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will
       contain the following columns: read ID, chromosome, position, strand
       (either 0 or 1), mapped sequence length, position of the closest
       upstream RE site, position of the closest downstream RE site

    Each output file starts with a header (chromosome lengths and number of
       mapped reads per iteration) followed by the per-file ``.tsv`` results
       concatenated and sorted by their first column with the system
       ``sort`` command.

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to
       the mapping of read2, can also be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information (mandatory)
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information (mandatory if f_names2 is given)
    :param None genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta`. containing the
       genomic sequence (mandatory)
    :param None re_name: name of the restriction enzyme used (mandatory)
    :param False verbose: print progress messages
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 1 ncpus: number of worker processes used to read the input files
    :param 100000 frag_chunk: (keyword argument) forwarded to
       ``map_re_sites`` and to the per-file reader

    :raises Exception: if a mandatory argument is missing, if only one of
       f_names2 / out_file2 is given, or if sorting any output file fails
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    # accept a single path as well as a list of paths
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = (f_names1, f_names2)
        outfiles = (out_file1, out_file2)
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )

    sorting = []
    tmp_files = []
    for read, (sam_files, outfile) in enumerate(zip(fnames, outfiles)):
        if verbose:
            print('Loading read' + str(read + 1))
        pool = mu.Pool(ncpus)
        jobs = [pool.apply_async(_read_one_sam,
                                 args=(fnam, mapper, verbose, frags,
                                       frag_chunk, num))
                for num, fnam in enumerate(sam_files, 1)]
        pool.close()
        pool.join()

        # sum, over all input files, the counts returned by each worker
        # (.get() re-raises any exception that happened in the worker)
        windows = {}
        for job in jobs:
            counts = job.get()
            for size in counts:
                windows[size] = windows.get(size, 0) + counts[size]

        ## write file header
        with open(outfile, 'w') as reads_fh:
            # chromosome sizes (in order)
            reads_fh.write('## Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('## Number of mapped reads by iteration\n')
            for size in windows:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))

        # write the rest of the file with the system `sort`, appending to the
        # header written above; done asynchronously. An argument list is used
        # (no shell) so that paths with spaces or shell metacharacters work.
        tsv_files = [fnam + '.tsv' for fnam in sam_files]
        tmp_files.extend(tsv_files)
        with open(outfile, 'a') as out_fh:
            # the child process keeps its own copy of the descriptor, the
            # parent handle can be closed right after spawning
            sorting.append(Popen(['sort', '-k1,1'] + tsv_files,
                                 stdout=out_fh))
    if verbose:
        print('Sorting reads')
    # wait for every sort before reporting; a negative return code means
    # the process was killed by a signal and is a failure too
    return_codes = [proc.wait() for proc in sorting]
    if any(code != 0 for code in return_codes):
        raise Exception('ERROR: problem sorting file\n')
    if verbose:
        print('Removing temporary files...')
    for tsv in tmp_files:
        # best-effort cleanup: like `rm -f`, a file that is missing or cannot
        # be removed is deliberately not an error
        try:
            os.remove(tsv)
        except OSError:
            pass
Exemplo n.º 16
0
def fast_fragment_mapping(mapper_index_path,
                          fastq_path1,
                          fastq_path2,
                          r_enz,
                          genome_seq,
                          out_map,
                          clean=True,
                          get_nread=False,
                          mapper_binary=None,
                          mapper_params=None,
                          samtools='samtools',
                          **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome with the knowledge of
    the restriction enzyme used (fragment-based mapping).

    The two read files are converted to mapper input, mapped together with
    GEM v3, name-sorted with samtools, parsed (in parallel chunks when more
    than one thread is requested) and written to `out_map`.

    :param mapper_index_path: path to index file created from a reference
       genome (a GEM v3 mapper binary is required by this function)
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or not.
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or not.
    :param r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII.
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta`. containing the
       genomic sequence
    :param out_map: path to outfile tab separated format containing mapped
       read information. Its parent directory must exist.
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list with one tuple containing the
       output path and the number of reads processed
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: dictionary of extra parameters, merged into
       the keyword arguments passed to the mapper and to the parser
    :param samtools samtools: path to samtools binary.
    :param 8 nthreads: (keyword argument) number of threads to use for
       mapping (number of CPUs)
    :param temp_dir: (keyword argument) important to change. Intermediate
       files will be written there. Defaults to the system temporary
       directory.
    :param '' suffix: (keyword argument) suffix added to the name of the
       intermediate map file
    :param 100000 frag_chunk: (keyword argument) forwarded to
       ``map_re_sites``

    :raises Exception: if the output directory does not exist, or if the
       GEM v3 or samtools binaries cannot be found

    :returns: path to the outfile (`out_map`), or ``[(out_map, nreads)]`` if
       `get_nread` is True
    """

    def _to_mapper_input(fastq_path):
        """Convert one read file into a temporary mapper input file."""
        base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
        base_name = '.'.join(base_name.split('.')[:-1])
        tmp_fd, tmp_path = mkstemp(prefix=base_name + '_', dir=temp_dir)
        # mkstemp returns an open descriptor: only the path is needed here,
        # so close it instead of leaking it
        os.close(tmp_fd)
        return transform_fastq(fastq_path,
                               tmp_path,
                               fastq=is_fastq(fastq_path),
                               nthreads=nthreads,
                               light_storage=True)

    gem_missing_msg = ('\n\nERROR: GEM v3 binary not found, install it from:'
                       '\nhttps://github.com/smarco/gem3-mapper\n'
                       'Copy the binary gem-mapper to /usr/local/bin/ for '
                       'example (somewhere in your PATH).\n')

    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    samtools = which(samtools)
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    # samtools is always needed to sort the mapper output: fail early with a
    # clear message instead of crashing after the mapping has been done
    if not samtools:
        raise Exception('\n\nERROR: samtools binary not found, it is needed '
                        'to sort the mapped reads.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    # check that we have the GEM binary:
    gem_binary = which(mapper_binary or 'gem-mapper')
    if not gem_binary:
        raise Exception(gem_missing_msg)
    try:
        out, _ = Popen([gem_binary, '--version'],
                       stdout=PIPE,
                       stderr=STDOUT,
                       universal_newlines=True).communicate()
        # the second character of the version output is read as the major
        # version number
        gem_version = int(out[1])
    except (ValueError, IndexError):
        # empty or unexpected version output
        gem_version = 2
        print('Falling to gem v2')
    if gem_version < 3:
        raise Exception(gem_missing_msg)
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    mkdir(temp_dir)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # prepare mapper input for both reads; the read count is taken from
    # read 2
    curr_map1, _ = _to_mapper_input(fastq_path1)
    curr_map2, count_fastq = _to_mapper_input(fastq_path2)

    out_map_path = curr_map1 + '_frag%s.map' % (suffix)

    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path,
                 curr_map1,
                 out_map_path,
                 fastq_path2=curr_map2,
                 r_enz=r_enz,
                 gem_binary=gem_binary,
                 gem_version=gem_version,
                 **kwargs)
    # clean
    if clean:
        print('   x removing GEM 3 input %s' % (curr_map1))
        os.system('rm -f %s' % (curr_map1))
        print('   x removing GEM 3 input %s' % (curr_map2))
        os.system('rm -f %s' % (curr_map2))

    # sort sam file by read name, in place
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' %
              (nthreads, out_map_path, out_map_path, out_map_path))
    genome_lengths = dict((crm, len(genome_seq[crm])) for crm in genome_seq)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)
    if nthreads > 1:
        print('Splitting sam file')
        # every part starts with a copy of the SAM header
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' %
                      (out_map_path, out_map_path, (i + 1)))
        # estimate lines in sam with reads and frags
        chunk_lines = int((count_fastq * 2.3) / nthreads)
        # append alignments to the parts, starting a new part only when the
        # line quota is reached AND the read name changes, so that all lines
        # of a read stay in the same part.
        # NOTE(review): chunk_lines is an estimate; if the SAM file has more
        # lines than estimated, awk creates parts numbered above nthreads,
        # which have no header and are not parsed below -- to be confirmed.
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" '
              BEGIN { part=0; line=n }
              { if( line>=n && $1!=last_read ) {part++; line=1; print $0 >> "%s_"part }
                else { print $0 >> "%s_"part; line++; }
                last_read = $1;
              }'
        ''' % (out_map_path, chunk_lines, out_map_path, out_map_path))
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        print('Parsing results...')
        # each parser runs single-threaded, parallelism comes from the pool
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            procs.append(
                pool.apply_async(parse_gem_3c,
                                 args=('%s_%d' % (out_map_path, (i + 1)),
                                       '%s_parsed_%d' % (out_map_path,
                                                         (i + 1)),
                                       copy.deepcopy(genome_lengths),
                                       copy.deepcopy(frags), False, True),
                                 kwds=kwargs))
        pool.close()
        pool.join()
        # keep only the non-empty results returned by the parsers
        results = [res for res in (proc.get() for proc in procs) if res]
        if clean:
            for i in range(nthreads):
                print('   x removing tmp mapped %s_%d' % (out_map_path,
                                                          (i + 1)))
                os.system('rm -f %s_%d' % (out_map_path, (i + 1)))

        # Final sort and merge: merge the parsed files pairwise until only
        # one is left
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, len(results) // 2)
            pool = mu.Pool(num_procs)
            procs = []
            for i in range(num_procs):
                first = results.pop(0)
                second = results.pop(0)
                procs.append(
                    pool.apply_async(merge_sort,
                                     (first, second,
                                      out_map_path + '_%d' % nround, i,
                                      True)))
            pool.close()
            pool.join()
            merged = [res for res in (proc.get() for proc in procs) if res]
            # whatever is still in `results` was not paired in this round
            # (odd number of files): keep it for the next round instead of
            # dropping it
            results = results + merged

        with open(out_map, 'w') as map_out:
            with open(results[0], 'r') as tmp_reads_fh:
                for crm in genome_seq:
                    map_out.write('# CRM %s\t%d\n' %
                                  (crm, len(genome_seq[crm])))
                for read_line in tmp_reads_fh:
                    read = read_line.split('\t')
                    # skip columns 1 and 8 of the temporary format
                    map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        if clean:
            print('   x removing tmp mapped %s' % results[0])
            os.system('rm -f %s' % (results[0]))

    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path,
                     out_map,
                     genome_lengths,
                     frags,
                     verbose=False,
                     tmp_format=False,
                     **kwargs)

        # clean
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))

    if get_nread:
        return [(out_map, count_fastq)]
    return out_map