示例#1
0
文件: bam_aln.py 项目: m1m0r1/galgo
def bam_variant_aln(args):
    """Open ``args.bam`` and loop over VCF entries, then over BAM alignments.

    NOTE(review): this function looks unfinished -- it uses ``vcf`` and
    ``self``, neither of which is assigned inside it, and it discards every
    value it computes.  Confirm the intended behaviour before relying on it.

    :param args: object exposing ``bam`` (handed to ``Samfile``) and ``vcf``
       (iterated over)
    """
    samfile = Samfile(args.bam)
    for rec in args.vcf:
        # NOTE(review): ``vcf`` is not assigned in this function; presumably
        # ``rec`` or ``args.vcf`` was meant -- TODO confirm.  The opened file
        # is never closed and ``reader`` is never read.
        fp = open(vcf)
        reader = pyvcf.Reader(fp)
        # NOTE(review): ``self`` is not a parameter of this function, so this
        # only works if a module-level ``self`` exists -- TODO confirm
        self.positions = []

    for rec in samfile.fetch(vcf):
        # the reference name is looked up but the result is not kept
        samfile.getrname(rec.tid)
        # bare expression: has no effect
        rec
示例#2
0
def single_end_sam_parsing(sam_list, cov, identity_threshold):
    """Collect, for every read, the references it matches in single-end mappings.

    :param sam_list: two-item sequence: the path of the end-to-end mapping
       followed by the path of the local mapping; an item set to ``None``
       is reported as a failed mapping and skipped
    :param cov: minimum percentage of the query that has to be aligned
    :param identity_threshold: minimum percentage identity, computed from
       the ``NM`` tag as ``(align_len - NM) / align_len * 100``

    :returns: a dictionary mapping each query name to the set of reference
       names it aligns to with enough coverage and identity
    """
    match = {}
    to_process = []
    if sam_list[0] is None:
        print("The end-to-end mapping of SE data produced an error.")
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print("The local mapping mode of SE data  produced an error.")
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        # close the handle even when a record makes the parsing fail
        try:
            for align in sam:
                if align.tid == -1 or align.cigar is None:
                    continue
                query_len = float(align.rlen)
                if not query_len:
                    # a zero-length query has no computable coverage; skip it
                    # instead of dividing by zero
                    continue
                query_name = align.qname
                ref_name = sam.getrname(align.tid)
                align_len, query_aligned_len = cigar_parsing(align.cigar)
                if not align_len:
                    continue
                if (query_aligned_len / query_len) * 100 < cov:
                    continue
                # the edit distance is only read for sufficiently covered
                # queries; -1 marks a record without NM tag
                nm = -1
                for tag, value in align.tags:
                    if tag == "NM":
                        nm = float(value)
                if nm < 0:
                    continue
                paired_perc_id = ((align_len - nm) / align_len) * 100
                if paired_perc_id >= identity_threshold:
                    match.setdefault(query_name, set()).add(ref_name)
        finally:
            sam.close()
    return match
示例#3
0
def main(args):
    """Group the alignments of a SAM/BAM file by reference and print them.

    :param args: parsed options; ``args.bamfile`` is the path of the
       alignment file and ``args.samformat`` is true when that file is
       plain-text SAM rather than BAM
    """
    # honour the requested format: the mode used to be hard-coded to "rb",
    # which left ``option`` unused and ``args.samformat`` without effect
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    try:
        # Iterates over each read instead of each contig
        outputs = defaultdict(list)
        for aln in samfile.fetch(until_eof=True):
            outputs[samfile.getrname(aln.tid)].append(aln)

        # ``items`` is available on both Python 2 and Python 3 dictionaries
        for ref, alns in outputs.items():
            print_reads(alns, ref, samfile.header)
    finally:
        samfile.close()
def main():
    """Print, for each BAM alignment, the rmsk intervals found over its span.

    Every output line holds the alignment's chromosome, start, end and name
    followed by the chromosome, start, end and name of one interval returned
    by ``IntervalFile.search``, all separated by single spaces.
    """
    alignments = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    repeats = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    for aln in alignments:
        chrom = alignments.getrname(aln.rname)
        start, end, name = aln.pos, aln.aend, aln.qname
        for hit in repeats.search(chrom, start, end):
            fields = (chrom, start, end, name,
                      hit.chrom, hit.start, hit.end, hit.name)
            print(" ".join(str(field) for field in fields))
示例#5
0
def paired_end_sam_parsing(sam_list, cov, identity_threshold):
    """Collect, for every read pair, the references both mates match.

    A reference is kept for a query when both mates align to it and the
    average of the best identity of each mate reaches ``identity_threshold``.

    :param sam_list: two-item sequence: the path of the end-to-end mapping
       followed by the path of the local mapping; an item set to ``None``
       is reported as a failed mapping and skipped
    :param cov: minimum percentage of a mate that has to be aligned
    :param identity_threshold: minimum average percentage identity of the
       two mates

    :returns: a dictionary mapping each query name to the set of reference
       names matched by both mates
    """
    # NOTE(review): the per-mate identity cut-off is fixed at 90 and does not
    # follow ``identity_threshold`` -- confirm this is intended
    mate_min_identity = 90
    match = {}
    to_process = []
    if sam_list[0] is None:
        print("The end-to-end mapping of PE data produced an error.")
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print("The local mapping mode of PE data  produced an error.")
    else:
        to_process.append(sam_list[1])
    for paired_sam in to_process:
        # query name -> reference name -> identities, one dictionary per mate
        r1_match = {}
        r2_match = {}
        sam = Samfile(paired_sam)
        # close the handle even when a record makes the parsing fail
        try:
            for align in sam:
                if align.tid == -1 or align.cigar is None:
                    continue
                query_len = float(align.rlen)
                if not query_len:
                    # a zero-length query has no computable coverage; skip it
                    # instead of dividing by zero
                    continue
                query_name = align.qname
                ref_name = sam.getrname(align.tid)
                align_len, query_aligned_len = cigar_parsing(align.cigar)
                if not align_len:
                    continue
                if (query_aligned_len / query_len) * 100 < cov:
                    continue
                # -1 marks a record without NM tag
                nm = -1
                for tag, value in align.tags:
                    if tag == "NM":
                        nm = float(value)
                if nm < 0:
                    continue
                paired_perc_id = ((align_len - nm) / align_len) * 100
                if paired_perc_id < mate_min_identity:
                    continue
                if align.is_read1:
                    r1_match.setdefault(query_name, {}).setdefault(
                        ref_name, []).append(paired_perc_id)
                if align.is_read2:
                    r2_match.setdefault(query_name, {}).setdefault(
                        ref_name, []).append(paired_perc_id)
        finally:
            sam.close()
        for query in set(r1_match.keys()).intersection(set(r2_match.keys())):
            for ref in set(r1_match[query].keys()).intersection(
                    r2_match[query].keys()):
                average_perc_id = calcola_media(
                    [max(r1_match[query][ref]),
                     max(r2_match[query][ref])])
                if average_perc_id >= identity_threshold:
                    match.setdefault(query, set()).add(ref)
    return match
示例#6
0
def main():
    """Report the rmsk features overlapping each alignment of the test BAM.

    One line is printed per hit: the alignment interval and the hit,
    separated by a tab.
    """
    alignments = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    features = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # Example 1:
    #    Method: IntervalFile.all_hits()
    #    Every feature returned for the alignment is reported; hits are
    #    requested on the same strand with ovlp_pct=0.75
    for aln in alignments:
        strand = "-" if aln.is_reverse else "+"
        query = Interval(alignments.getrname(aln.rname), aln.pos, aln.aend,
                         strand)
        hits = features.all_hits(query, same_strand=True, ovlp_pct=0.75)
        for hit in hits:
            print("\t".join(map(str, (query, hit))))
def main():
    """Print each BAM alignment next to every rmsk feature it overlaps.

    The alignment is turned into a stranded ``Interval`` and looked up with
    ``IntervalFile.all_hits``; each hit gives one tab-separated output line.
    """
    bam_reader = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    rmsk_index = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # Example 1:
    #    Method: IntervalFile.all_hits()
    #    Report all the features returned for the alignment interval
    for record in bam_reader:
        if record.is_reverse:
            strand = "-"
        else:
            strand = "+"
        chrom = bam_reader.getrname(record.rname)
        interval = Interval(chrom, record.pos, record.aend, strand)

        for hit in rmsk_index.all_hits(interval, same_strand=True,
                                       ovlp_pct=0.75):
            columns = [str(item) for item in (interval, hit)]
            print("\t".join(columns))
示例#8
0
def _bowtie2_filter(fnam, fastq_path, unmap_out, map_out):
    """
    Divides reads in a map file in two categories: uniquely mapped, and not.
    Writes them in two files

    :param fnam: path to the SAM/BAM file to split
    :param fastq_path: path to the FASTQ file; one four-line record is
       consumed per alignment, and the first line of that record provides
       the name written for reads that are not kept as mapped
    :param unmap_out: path of the file receiving the reads that are unmapped
       or have a mapping quality below 4
    :param map_out: path of the file receiving all the other reads

    :raises Exception: when ``fnam`` cannot be opened
    """
    try:
        fhandler = Samfile(fnam)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % fnam)
    try:
        # getrname chromosome names: ids are probed upwards until
        # getrname rejects one
        i = 0
        crm_dict = {}
        while True:
            try:
                crm_dict[i] = fhandler.getrname(i)
                i += 1
            except ValueError:
                break
        # iteration over reads; the context managers close the three files
        # even when a record cannot be processed
        with open(unmap_out, 'w') as unmap_fh, \
                open(map_out, 'w') as map_fh, \
                open(fastq_path, 'r') as fastq_in:
            for line in fhandler:
                line_in = fastq_in.readline()
                if line.is_unmapped or line.mapq < 4:
                    # name taken from the FASTQ header, up to the first tab
                    # and without its leading character
                    read = '%s\t%s\t%s\t%s\t%s\n' % (
                        line_in.split('\t', 1)[0].rstrip('\n')[1:],
                        line.seq, line.qual, '-', '-'
                        )
                    unmap_fh.write(read)
                else:
                    read = '%s\t%s\t%s\t%s\t%s:%s:%d:%d\n' % (
                        line.qname, line.seq, line.qual, '1',
                        crm_dict[line.tid],
                        '-' if line.is_reverse else '+', line.pos + 1,
                        len(line.seq))
                    map_fh.write(read)
                # skip the three remaining lines of the FASTQ record
                for _ in range(3):
                    fastq_in.readline()
    finally:
        fhandler.close()
示例#9
0
def _sam_filter(fnam, fastq_path, unmap_out, map_out):
    """
    Divides reads in a map file in two categories: uniquely mapped, and not.
    Writes them in two files

    :param fnam: path to the SAM/BAM file to split
    :param fastq_path: path to the FASTQ file; one four-line record is
       consumed per alignment, and the first line of that record provides
       the name written for reads that are not kept as mapped
    :param unmap_out: path of the file receiving the reads that are unmapped
       or have a mapping quality below 4
    :param map_out: path of the file receiving all the other reads

    :raises Exception: when ``fnam`` cannot be opened
    """
    try:
        fhandler = Samfile(fnam)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % fnam)
    try:
        # getrname chromosome names: ids are probed upwards until
        # getrname rejects one
        i = 0
        crm_dict = {}
        while True:
            try:
                crm_dict[i] = fhandler.getrname(i)
                i += 1
            except ValueError:
                break
        # iteration over reads; the context managers close the three files
        # even when a record cannot be processed
        with open(unmap_out, 'w') as unmap_fh, \
                open(map_out, 'w') as map_fh, \
                open(fastq_path, 'r') as fastq_in:
            for line in fhandler:
                line_in = fastq_in.readline()
                if line.is_unmapped or line.mapq < 4:
                    # name taken from the FASTQ header, up to the first tab
                    # and without its leading character
                    read = '%s\t%s\t%s\t%s\t%s\n' % (line_in.split(
                        '\t', 1)[0].rstrip('\n')[1:], line.seq, line.qual,
                        '-', '-')
                    unmap_fh.write(read)
                else:
                    read = '%s\t%s\t%s\t%s\t%s:%s:%d:%d\n' % (
                        line.qname, line.seq, line.qual, '1',
                        crm_dict[line.tid],
                        '-' if line.is_reverse else '+', line.pos + 1,
                        len(line.seq))
                    map_fh.write(read)
                # skip the three remaining lines of the FASTQ record
                for _ in range(3):
                    fastq_in.readline()
    finally:
        fhandler.close()
示例#10
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param True clean: remove the merged temporary file once the output file
       is written
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (keyword only) chunk size passed to
       ``map_re_sites`` and used to index its result by ``pos // frag_chunk``

    :returns: two dictionaries keyed by read index (0 for read1, 1 for
       read2): the number of reads kept per mapping iteration, and the number
       of lines merged into a previous line because they share its read name

    :raises Exception: when a mandatory argument is missing, when an input
       is not a SAM/BAM file, or when a read lies mostly outside of its
       chromosome
    :raises StopIteration: when nothing could be parsed from the inputs
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis  = {}
    procs   = []
    for read, read_fnames in enumerate(fnames):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in read_fnames:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping; when the
            # file extension is not a number, count the files instead
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            # ``condition`` receives the tags of a read and tells whether
            # the read has to be skipped
            if mapper.lower()=='gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading SAM file from %s: %s' % (mapper, fnam))
            # getrname chromosome names
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                # floor division: the chunk index has to stay an integer
                # under true division as well
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            fhandler.close()
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)


        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]

        if verbose:
            print('Getting Multiple contacts')
        with open(outfiles[read], 'w') as reads_fh:
            ## Also pipe file header
            # chromosome sizes (in order)
            reads_fh.write('# Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('# Mapped\treads count by iteration\n')
            for size in windows[read]:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

            ## Multicontacts
            with open(tmp_name) as tmp_reads_fh:
                try:
                    read_line = next(tmp_reads_fh)
                except StopIteration:
                    raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                        ' chromosome names (in genome.fasta and SAM/MAP'
                                        ' files).')
                # lines sharing a read name (the part before '~') are
                # joined on a single line with '|||'
                prev_head = read_line.split('\t', 1)[0]
                prev_head = prev_head.split('~' , 1)[0]
                prev_read = read_line
                multis[read] = 0
                for read_line in tmp_reads_fh:
                    head = read_line.split('\t', 1)[0]
                    head = head.split('~' , 1)[0]
                    if head == prev_head:
                        multis[read] += 1
                        prev_read =  prev_read.strip() + '|||' + read_line
                    else:
                        reads_fh.write(prev_read)
                        prev_read = read_line
                    prev_head = head
                reads_fh.write(prev_read)
        if clean:
            # plain file removal: no shell involved, so any path is safe
            os.remove(tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
示例#11
0
def _gem_3c_pair_rows(read1, read2, crm_dict, frags, frag_chunk):
    """Build the output rows pairing up the alignments of one read name.

    :param read1: the read1 alignment that opens the group
    :param read2: list of the read2/supplementary alignments collected with it
    :param crm_dict: reference id to chromosome name
    :param frags: RE sites, indexed as ``frags[chromosome][pos // frag_chunk]``
    :param frag_chunk: chunk size used to index ``frags``

    :returns: a list of rows, each one the read name (suffixed with
       ``#rank/total`` when the group yields several pairs) followed by the
       seven fields of both alignments of the pair
    """
    reads_grp = []
    read_id = read1.query_name
    for read in [read1] + read2:
        if read.query_name != read_id:
            # record collected under another read name
            continue
        positive = not read.is_reverse
        crm      = crm_dict[read.tid]
        len_seq  = read.reference_end - read.pos
        if positive:
            pos = read.pos + 1
        else:
            pos = read.pos + len_seq
        try:
            frag_piece = frags[crm][pos // frag_chunk]
        except KeyError:
            # Chromosome not in hash
            # NOTE(review): the alignments gathered so far for this group are
            # kept; confirm whether the whole group should be dropped
            break
        idx = bisect(frag_piece, pos)
        try:
            next_re = frag_piece[idx]
        except IndexError:
            # case where part of the read is mapped outside chromosome
            count = 0
            while idx >= len(frag_piece) and count < len_seq:
                pos -= 1
                count += 1
                frag_piece = frags[crm][pos // frag_chunk]
                idx = bisect(frag_piece, pos)
            if count >= len_seq:
                raise Exception('Read mapped mostly outside ' +
                                'chromosome\n')
            next_re    = frag_piece[idx]
        prev_re    = frag_piece[idx - 1 if idx else 0]
        reads_grp.append([read.tid, crm, pos, positive,
                          len_seq, prev_re, next_re])
    if len(reads_grp) > 2:
        _merge_multis(reads_grp)
    elif len(reads_grp) < 2:
        reads_grp = []
    # every pair of alignments gives one row, the alignment with the lowest
    # (reference id, position) first
    reads_multi = []
    for paired_reads in combinations(reads_grp, 2):
        read_multi = [item
                      for sublist in sorted(paired_reads,
                                            key=lambda x: (x[0], x[2]))
                      for item in sublist]
        if read_multi:
            reads_multi.append(read_multi)

    paired_total = len(reads_multi)
    rows = []
    for paired_nbr, pair_read in enumerate(reads_multi, 1):
        read_name_id = read_id
        if paired_total > 1:
            read_name_id += '#%d/%d' % (paired_nbr, paired_total)
        rows.append([read_name_id] + pair_read)
    return rows


def _gem_3c_write_chunk(reads, out_file, tmp_files, nfile):
    """Sort the buffered rows and pass them on as lines to a temporary file."""
    ordered = sorted(reads, key=lambda x: (x[1], x[3], x[8], x[10]))
    read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                  % tuple(read) for read in ordered]
    write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)


def parse_gem_3c(f_name, out_file, genome_lengths, frags, verbose=False,
                 tmp_format=False, **kwargs):
    """
    Parse gem 3c sam file using pysam tools.

    Records that are unpaired, unmapped or have a mapping quality below 4 are
    ignored. The remaining ones are grouped by read name: a read1 record
    opens a group, the read2/supplementary records that follow are added to
    it, and the group is written when the next read name starts or when the
    input ends.

    :param f_name: path to sam file corresponding to the mapping of reads
    :param out_file: path to outfile tab separated format containing paired read information
    :param genome_lengths: a dictionary generated containing the length of the genomic sequence
                           per chromosome
    :param frags: RE sites, indexed as ``frags[chromosome][pos // frag_chunk]``
    :param False verbose: print the progress of the merge sort
    :param False tmp_format: If True leave the file prepared to be merged with other map files.
    :param 100000 frag_chunk: (keyword only) chunk size used to index ``frags``

    :returns: ``out_file``, or None when the SAM file holds no record

    :raises Exception: when ``f_name`` cannot be opened or when a read lies
       mostly outside of its chromosome
    """

    frag_chunk = kwargs.get('frag_chunk', 100000)
    try:
        fhandler = Samfile(f_name)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % f_name)

    # max number of reads in buffer
    max_size = 1000000

    # getrname chromosome names
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads
    sub_count = 0
    nfile = 0
    tmp_files = []
    reads = []
    cur_name = ''
    write_pairs = False
    read1 = None
    read2 = []
    samiter = fhandler.fetch(until_eof=True)
    # ``r`` becomes None once the iterator is exhausted, which ends the loop
    r = next(samiter, None)
    if r is None:
        # empty SAM file
        return None
    while r:
        if not r.is_paired or r.is_unmapped or r.mapq < 4:
            r = next(samiter, None)
            continue

        if r.is_read1 and cur_name != r.qname:
            if read1 is None:
                read1 = r
                cur_name = r.qname
                r = next(samiter, None)
                continue
            # a new read name starts: the current group is complete
            write_pairs = True

        if not write_pairs:
            if r.is_read2 or r.is_supplementary:
                read2.append(r)
            # always move on: a record that is neither a new read1, a read2
            # nor supplementary used to be re-examined forever
            r = next(samiter, None)
            continue

        # ``r`` opens the next group and is handled again on the next pass,
        # whether or not the finished group had mates: it used to be dropped
        # when the previous read1 had none
        if read2:
            rows = _gem_3c_pair_rows(read1, read2, crm_dict, frags,
                                     frag_chunk)
            reads.extend(rows)
            sub_count += len(rows)
            if sub_count >= max_size:
                sub_count = 0
                nfile += 1
                _gem_3c_write_chunk(reads, out_file, tmp_files, nfile)
                del reads[:]
        write_pairs = False
        read1 = None
        del read2[:]
    # no further read name comes to trigger the write-out of the last group,
    # so it is still pending here
    if read1 is not None and read2:
        reads.extend(_gem_3c_pair_rows(read1, read2, crm_dict, frags,
                                       frag_chunk))
    if reads:
        nfile += 1
        _gem_3c_write_chunk(reads, out_file, tmp_files, nfile)

    # we have now sorted temporary files
    # we do merge sort for each pair
    if verbose:
        stdout.write('Merge sort')
        stdout.flush()
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        try:
            file2 = tmp_files.pop(0)
        except IndexError:
            break
        if verbose:
            stdout.write('.')
        stdout.flush()
        nfile += 1
        tmp_files.append(merge_sort(file1, file2, out_file, nfile, paired=True))
    if verbose:
        stdout.write('\n')

    if tmp_format:
        os.rename(tmp_files[0], out_file)
    else:
        # text mode, so that the lines can be split on a str separator
        with open(out_file, 'w') as map_out, \
                open(tmp_files[0], 'r') as tmp_reads_fh:
            for crm in genome_lengths:
                map_out.write('# CRM %s\t%d\n' % (crm, genome_lengths[crm]))
            for read_line in tmp_reads_fh:
                read = read_line.split('\t')
                # columns 1 and 8 (the reference ids used for sorting) are
                # left out of the final file
                map_out.write('\t'.join([read[0]]+read[2:8]+read[9:]))
        # plain file removal: no shell involved, so any path is safe
        os.remove(tmp_files[0])

    return out_file
示例#12
0
def parse_sam(f_names1, f_names2, frags, out_file1, out_file2, genome_seq,
              re_name, verbose=False, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1
    :param f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2
    :param frags: a dictionary generated by :func:`pyatdbit.mapping.restriction_enzymes.map_re_sites`.
       NOTE(review): the value passed in is not used, the RE sites are
       computed again from ``re_name`` and ``genome_seq`` -- confirm which
       one is intended
    :param out_file1: path to the output file for read1
    :param out_file2: path to the output file for read2
    :param genome_seq: genomic sequence, passed to ``map_re_sites``
    :param re_name: name of the restriction enzyme, passed to ``map_re_sites``
    :param False verbose: print progress messages
    :param 100000 frag_chunk: (keyword only) chunk size used to index the RE
       sites by ``pos // frag_chunk``. NOTE(review): it is not forwarded to
       ``map_re_sites``, so it presumably has to match that function's own
       chunk size -- confirm

    """
    # follow the ``verbose`` argument instead of always printing
    frags = map_re_sites(re_name, genome_seq, verbose=verbose)
    frag_chunk = kwargs.get('frag_chunk', 100000)

    fnames = f_names1, f_names2
    outfiles = out_file1, out_file2
    for read in range(2):
        if verbose:
            print('Loading read' + str(read + 1))
        reads    = []
        for fnam in fnames[read]:
            if verbose:
                print('loading file: %s' % (fnam,))
            try:
                fhandler = Samfile(fnam)
            except IOError:
                # unreadable files are silently skipped
                continue
            # reference id to chromosome name, without any 'chr'
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i).replace('chr', '')
                    i += 1
                except ValueError:
                    break
            for r in fhandler:
                if r.is_unmapped:
                    continue
                # only reads whose second tag has the value 1 are kept
                if r.tags[1][1] != 1:
                    continue
                positive   = not r.is_reverse
                crm        = crm_dict[r.tid]
                len_seq    = len(r.seq)
                pos        = r.pos + (0 if positive else len_seq)
                # floor division: the chunk index has to stay an integer
                # under true division as well
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx        = bisect(frag_piece, pos)
                prev_re    = frag_piece[idx - 1]
                next_re    = frag_piece[idx]
                name       = r.qname

                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
            fhandler.close()
        with open(outfiles[read], 'w') as reads_fh:
            reads_fh.write(''.join(sorted(reads)))
            usage()
            sys.exit()

    acc2node = itsonedb2node(fasta_itesondb)

    if outfile is None:
        sys.exit("Output file option is missing")
    match = {}
    # mappiamo prima in modalita' glocal
    if single_sam is not None:
        if os.path.exists(single_sam):
            sam = Samfile(single_sam)
            for align in sam:
                if align.tid != -1:
                    query_name, query_len, ref_name = align.qname, float(
                        align.rlen), sam.getrname(align.tid)
                    if align.cigar is not None:
                        align_len, query_aligned_len = cigar_parsing(
                            align.cigar)
                        nm = -1
                        if (query_aligned_len / query_len) * 100 >= coverage:
                            for coppia in align.tags:
                                if coppia[0] == "NM":
                                    nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = (
                                (align_len - nm) / align_len) * 100
                            if paired_perc_id >= identity_threshold:
                                match.setdefault(query_name, set())
                                match[query_name].add(ref_name)
            sam.close()
示例#14
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param True clean: remove the merged temporary file once the final output
       file has been written
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (read from kwargs) chunk size handed to
       map_re_sites and used to index its result by position

    :returns: a tuple (windows, multis). *windows* maps each read index (0 or
       1) to a dictionary {iteration number: number of reads kept}; *multis*
       maps each read index to the number of lines that were merged (with
       '|||') into a previous line sharing the same read ID.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, basestring):
        f_names1 = [f_names1]
    if isinstance(f_names2, basestring):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis  = {}
    procs   = []
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # from here on the handle is open: make sure it gets closed
            try:
                # get the iteration number of the iterative mapping
                try:
                    num = int(fnam.split('.')[-1].split(':')[0])
                except ValueError:
                    # no number in the file extension: count files instead
                    num += 1
                # set read counter
                windows[read].setdefault(num, 0)
                # guess mapper used (kept for the following files once guessed)
                if not mapper:
                    mapper = fhandler.header['PG'][0]['ID']
                # condition(tags) is True for reads that have to be skipped
                if mapper.lower()=='gem':
                    condition = lambda x: x[1][0][0] != 'N'
                elif mapper.lower() in ['bowtie', 'bowtie2']:
                    condition = lambda x: 'XS' in dict(x)
                else:
                    warn('WARNING: unrecognized mapper used to generate file\n')
                    condition = lambda x: x[1][1] != 1
                if verbose:
                    print('loading SAM file from %s: %s' % (mapper, fnam))
                # getrname chromosome names: ask for increasing reference ids
                # until pysam complains
                i = 0
                crm_dict = {}
                while True:
                    try:
                        crm_dict[i] = fhandler.getrname(i)
                        i += 1
                    except ValueError:
                        break
                # iteration over reads
                sub_count = 0  # to empty read buffer
                for r in fhandler:
                    if r.is_unmapped:
                        continue
                    if condition(r.tags):
                        continue
                    positive = not r.is_reverse
                    crm      = crm_dict[r.tid]
                    len_seq  = len(r.seq)
                    if positive:
                        pos = r.pos + 1
                    else:
                        pos = r.pos + len_seq
                    try:
                        frag_piece = frags[crm][pos // frag_chunk]
                    except KeyError:
                        # Chromosome not in hash
                        continue
                    idx = bisect(frag_piece, pos)
                    try:
                        next_re = frag_piece[idx]
                    except IndexError:
                        # case where part of the read is mapped outside
                        # chromosome: walk back one position at a time
                        count = 0
                        while idx >= len(frag_piece) and count < len_seq:
                            pos -= 1
                            count += 1
                            frag_piece = frags[crm][pos // frag_chunk]
                            idx = bisect(frag_piece, pos)
                        if count >= len_seq:
                            raise Exception('Read mapped mostly outside ' +
                                            'chromosome\n')
                        next_re    = frag_piece[idx]
                    # idx == 0 would otherwise wrap around to the last site
                    prev_re    = frag_piece[idx - 1 if idx else 0]
                    name       = r.qname
                    reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                        name, crm, pos, positive, len_seq, prev_re, next_re))
                    windows[read][num] += 1
                    sub_count += 1
                    if sub_count >= max_size:
                        sub_count = 0
                        nfile += 1
                        write_reads_to_file(reads, outfiles[read], tmp_files,
                                            nfile)
            finally:
                fhandler.close()
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)


        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            # the loop condition guarantees that two files are available
            file1 = tmp_files.pop(0)
            file2 = tmp_files.pop(0)
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        # NOTE(review): raises IndexError when none of the input files could
        # be opened (no temporary file was produced)
        tmp_name = tmp_files[0]

        if verbose:
            print('Getting Multiple contacts')
        with open(outfiles[read], 'w') as reads_fh:
            ## Also pipe file header
            # chromosome sizes (in order)
            reads_fh.write('# Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('# Mapped\treads count by iteration\n')
            for size in windows[read]:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

            ## Multicontacts
            with open(tmp_name) as tmp_reads_fh:
                try:
                    read_line = next(tmp_reads_fh)
                except StopIteration:
                    raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                        ' chromosome names (in genome.fasta and SAM/MAP'
                                        ' files).')
                # read ID is the first column, without anything after '~'
                prev_head = read_line.split('\t', 1)[0]
                prev_head = prev_head.split('~' , 1)[0]
                prev_read = read_line
                multis[read] = 0
                for read_line in tmp_reads_fh:
                    head = read_line.split('\t', 1)[0]
                    head = head.split('~' , 1)[0]
                    if head == prev_head:
                        # same read ID as previous line: glue both lines
                        multis[read] += 1
                        prev_read =  prev_read.strip() + '|||' + read_line
                    else:
                        reads_fh.write(prev_read)
                        prev_read = read_line
                    prev_head = head
                reads_fh.write(prev_read)
        if clean:
            # best-effort removal, without going through a shell
            try:
                os.remove(tmp_name)
            except OSError:
                pass
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
示例#15
0
def _read_one_sam(fnam, mapper, verbose, frags, frag_chunk, num):
    """
    Parse one SAM/BAM file and write the reads it keeps to fnam + '.tsv'.

    Each kept read is written as one tab-separated line: read ID, chromosome,
    position, strand (0 or 1), sequence length, previous RE site, next RE
    site.

    :param fnam: path to the SAM/BAM file. The text after its last '.' (up to
       an optional ':') has to be an integer: it is used as iteration number
    :param mapper: name of the mapper ('gem', 'bowtie' or 'bowtie2'). When
       empty, it is read from the first PG entry of the file header
    :param verbose: print the name of the file being loaded
    :param frags: RE sites, accessed as frags[chromosome][pos // frag_chunk]
       to get a list of positions (searched with bisect, so expected sorted)
    :param frag_chunk: chunk size used to index *frags*
    :param num: not used, overwritten by the number parsed from *fnam*

    :returns: a dictionary {iteration number: number of reads written}.
       NOTE(review): when the file is not found the tuple ({}, []) is
       returned instead; left unchanged for backward compatibility.
    """
    out = open(fnam + '.tsv', 'w')
    # whatever happens below, the output file has to be closed
    try:
        lwindows = {}
        try:
            fhandler = Samfile(fnam)
        except IOError:
            print('WARNING: file "%s" not found' % fnam)
            return {}, []
        except ValueError:
            raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
        try:
            # get the iteration number of the iterative mapping
            num = int(fnam.split('.')[-1].split(':')[0])
            lwindows.setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            # condition(tags) is True for reads that have to be skipped
            if mapper.lower()=='gem':
                condition = lambda x: x[1][1] != 1
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading %s file: %s' % (mapper, fnam))
            # chromosome names: ask for increasing reference ids until pysam
            # complains
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq + 1
                try:
                    # floor division: the chunk index has to stay an integer
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside
                    # chromosome: walk back one position at a time
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                # idx == 0 would otherwise wrap around to the last site
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                out.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                lwindows[num] += 1
        finally:
            fhandler.close()
    finally:
        out.close()
    return lwindows
示例#16
0
def _read_one_sam(fnam, mapper, verbose, frags, frag_chunk, num):
    """
    Convert one SAM/BAM file into a tab-separated file named fnam + '.tsv'.

    One line is written per kept read, with 7 columns: read ID, chromosome,
    position, strand (0 or 1), sequence length, previous RE site and next RE
    site.

    :param fnam: path to the SAM/BAM file. What follows its last '.' (up to
       an optional ':') is converted to an integer and used as iteration
       number
    :param mapper: mapper name ('gem', 'bowtie' or 'bowtie2'); if empty it is
       taken from the first PG entry of the file header
    :param verbose: print the name of the file being loaded
    :param frags: RE sites, accessed as frags[chromosome][pos // frag_chunk]
       to get a list of positions (searched with bisect, so expected sorted)
    :param frag_chunk: chunk size used to index *frags*
    :param num: not used, replaced by the number parsed from *fnam*

    :returns: a dictionary {iteration number: number of reads written}.
       NOTE(review): a missing input file yields the tuple ({}, []) instead;
       left unchanged for backward compatibility.
    """
    out = open(fnam + '.tsv', 'w')
    # the output handle is released on every exit path
    try:
        lwindows = {}
        try:
            fhandler = Samfile(fnam)
        except IOError:
            print('WARNING: file "%s" not found' % fnam)
            return {}, []
        except ValueError:
            raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
        try:
            # get the iteration number of the iterative mapping
            num = int(fnam.split('.')[-1].split(':')[0])
            lwindows.setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            # condition(tags) tells which reads are skipped
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][1] != 1
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading %s file: %s' % (mapper, fnam))
            # reference id -> chromosome name, collected until pysam raises
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq + 1
                try:
                    # floor division keeps the chunk index an integer
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside
                    # chromosome: step back until a downstream site exists
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' + 'chromosome\n')
                    next_re = frag_piece[idx]
                # with idx == 0 a plain idx - 1 would pick the last site
                prev_re = frag_piece[idx - 1 if idx else 0]
                name = r.qname
                out.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' %
                          (name, crm, pos, positive, len_seq, prev_re, next_re))
                lwindows[num] += 1
        finally:
            fhandler.close()
    finally:
        out.close()
    return lwindows
示例#17
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (read from kwargs) chunk size handed to
       map_re_sites and used to index its result by position
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        reads    = []
        for fnam in fnames[read]:
            if verbose:
                print('loading file: %s' % fnam)
            try:
                fhandler = Samfile(fnam)
            except IOError:
                # skip the file, but do not hide that it is missing
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # from here on the handle is open: make sure it gets closed
            try:
                # guess mapper used (kept for the following files once guessed)
                if not mapper:
                    mapper = fhandler.header['PG'][0]['ID']
                # condition(tags) is True for reads that have to be skipped
                if mapper.lower()=='gem':
                    condition = lambda x: x[1][1] != 1
                elif mapper.lower() in ['bowtie', 'bowtie2']:
                    condition = lambda x: 'XS' in dict(x)
                else:
                    warn('WARNING: unrecognized mapper used to generate file\n')
                    condition = lambda x: x[1][1] != 1
                if verbose:
                    print('MAPPER: %s' % mapper)
                # chromosome names: ask for increasing reference ids until
                # pysam complains
                i = 0
                crm_dict = {}
                while True:
                    try:
                        crm_dict[i] = fhandler.getrname(i)
                        i += 1
                    except ValueError:
                        break
                # iteration over reads
                for r in fhandler:
                    if r.is_unmapped:
                        continue
                    if condition(r.tags):
                        continue
                    positive = not r.is_reverse
                    crm      = crm_dict[r.tid]
                    len_seq  = len(r.seq)
                    pos      = r.pos + (0 if positive else len_seq)
                    try:
                        # floor division: the chunk index has to stay an
                        # integer
                        frag_piece = frags[crm][pos // frag_chunk]
                    except KeyError:
                        # Chromosome not in hash
                        continue
                    idx        = bisect(frag_piece, pos)
                    try:
                        next_re    = frag_piece[idx]
                    except IndexError:
                        # case where part of the read is mapped outside
                        # chromosome: walk back one position at a time
                        count = 0
                        while idx >= len(frag_piece) and count < len_seq:
                            pos -= 1
                            count += 1
                            frag_piece = frags[crm][pos // frag_chunk]
                            idx        = bisect(frag_piece, pos)
                        if count >= len_seq:
                            raise Exception('Read mapped mostly outside ' +
                                            'chromosome\n')
                        next_re    = frag_piece[idx]
                    # idx == 0 would otherwise wrap around to the last site
                    prev_re    = frag_piece[idx - 1 if idx else 0]
                    name       = r.qname

                    reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                        name, crm, pos, positive, len_seq, prev_re, next_re))
            finally:
                fhandler.close()
        with open(outfiles[read], 'w') as reads_fh:
            ## write file header
            # chromosome sizes (in order)
            reads_fh.write('## Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            # all reads of this end are sorted in memory before being written
            reads_fh.write(''.join(sorted(reads)))
    del reads
示例#18
0
# Group the segments of the main SAM/BAM file (args.path) as
# errors2segments[error type][read number] -> list of segments.
# `errors` is defined outside of this fragment; it is used here as a mapping
# from an error type to a collection of read numbers.
errors2segments = defaultdict(lambda: defaultdict(list));
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
	# the read number is what precedes the first "|" in the query name
	num = segment.query_name.split("|")[0]
	for etype, eset in errors.iteritems():
		if(num in eset):
			# a segment is only stored under the first error type that lists it
			errors2segments[etype][num].append(segment);
			break;
		
		
# Collect the segments of every additional file by read number, each one
# wrapped (ArWrapper) together with the name of its reference sequence.
additional = defaultdict(list);
for fname in args.additional:
	tsamfile = Samfile(fname);
	for segment in tsamfile.fetch(until_eof=True):
		num = segment.query_name.split("|")[0]
		additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
	tsamfile.close();
		
		
		
		
for etype, d in errors2segments.iteritems():
	with open(os.path.join(args.outdir, "%s_%s_error.txt" % etype), 'w') as f:
		for num, segments in d.iteritems():
			if(segments[0].is_reverse):
				seq = reverse_complement(segments[0].seq);
			else:	
				seq = segments[0].seq
			
			f.write("%s\nnumber of read:\t%s\n\nSequence:\t%s\n\nSegments:\n\n" % ("_"*140, num, seq))
			for segment in segments:
示例#19
0
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param None f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param None out_file1: path to outfile tab separated format containing
       mapped read1 information
    :param None out_file2: path to outfile tab separated format containing
       mapped read2 information
    :param None genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param None re_name: name of the restriction enzyme used
    :param False verbose: print progress messages
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    :param 100000 frag_chunk: (read from kwargs) chunk size handed to
       map_re_sites and used to index its result by position
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        # number of reads kept, by iteration of the iterative mapping
        windows = {}
        reads    = []
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # from here on the handle is open: make sure it gets closed
            try:
                # get the iteration number of the iterative mapping
                try:
                    num = int(fnam.split('.')[-1].split(':')[0])
                except ValueError:
                    # no number in the file extension: count files instead
                    num += 1
                windows.setdefault(num, 0)
                # guess mapper used (kept for the following files once guessed)
                if not mapper:
                    mapper = fhandler.header['PG'][0]['ID']
                # condition(tags) is True for reads that have to be skipped
                if mapper.lower()=='gem':
                    condition = lambda x: x[1][1] != 1
                elif mapper.lower() in ['bowtie', 'bowtie2']:
                    condition = lambda x: 'XS' in dict(x)
                else:
                    warn('WARNING: unrecognized mapper used to generate file\n')
                    condition = lambda x: x[1][1] != 1
                if verbose:
                    print('loading %s file: %s' % (mapper, fnam))
                # chromosome names: ask for increasing reference ids until
                # pysam complains
                i = 0
                crm_dict = {}
                while True:
                    try:
                        crm_dict[i] = fhandler.getrname(i)
                        i += 1
                    except ValueError:
                        break
                # iteration over reads
                for r in fhandler:
                    if r.is_unmapped:
                        continue
                    if condition(r.tags):
                        continue
                    positive = not r.is_reverse
                    crm      = crm_dict[r.tid]
                    len_seq  = len(r.seq)
                    if positive:
                        pos = r.pos + 1
                    else:
                        pos = r.pos + len_seq + 1
                    try:
                        # floor division: the chunk index has to stay an
                        # integer
                        frag_piece = frags[crm][pos // frag_chunk]
                    except KeyError:
                        # Chromosome not in hash
                        continue
                    idx = bisect(frag_piece, pos)
                    try:
                        next_re = frag_piece[idx]
                    except IndexError:
                        # case where part of the read is mapped outside
                        # chromosome: walk back one position at a time
                        count = 0
                        while idx >= len(frag_piece) and count < len_seq:
                            pos -= 1
                            count += 1
                            frag_piece = frags[crm][pos // frag_chunk]
                            idx = bisect(frag_piece, pos)
                        if count >= len_seq:
                            raise Exception('Read mapped mostly outside ' +
                                            'chromosome\n')
                        next_re    = frag_piece[idx]
                    # idx == 0 would otherwise wrap around to the last site
                    prev_re    = frag_piece[idx - 1 if idx else 0]
                    name       = r.qname
                    reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                        name, crm, pos, positive, len_seq, prev_re, next_re))
                    windows[num] += 1
            finally:
                fhandler.close()
        with open(outfiles[read], 'w') as reads_fh:
            ## write file header
            # chromosome sizes (in order)
            reads_fh.write('## Chromosome lengths (order matters):\n')
            for crm in genome_seq:
                reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            reads_fh.write('## Number of mapped reads by iteration\n')
            for size in windows:
                reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
            # all reads of this end are sorted in memory before being written
            reads_fh.write(''.join(sorted(reads)))
    del reads