示例#1
0
def bam_count(args):
    """Count read pairs per gene and print a tab-separated summary.

    Paired alignments are read from the SAM file ``args.fi``. For every pair
    the feature sets found in ``exons`` under both mates are merged; the pair
    is then tallied under its single gene id, ``_no_feature`` (no overlap) or
    ``_ambiguous`` (more than one id). Pairs with a missing or unaligned mate
    are tallied under ``_unmapped``.

    Args:
        args: Object with an ``fi`` attribute naming the SAM input file.
    """
    bam = HTSeq.SAM_Reader(args.fi)
    # NOTE(review): `exons` is never assigned in this function. With the
    # loader call below disabled it has to exist at module level -- confirm.
    #exons = htseq_read_gtf(args.fg)
    cnts = collections.Counter()
    # The pairing helper yields (mate1, mate2) tuples and uses None for a
    # mate it could not find, so each item is unpacked directly.
    for aln1, aln2 in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # Both mates must be present and aligned before their intervals are
        # looked up; anything else is reported as unmapped.
        if aln1 is None or aln2 is None \
                or not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for _, val in exons[aln.iv].steps():
                gids |= val
        if not gids:
            cnts["_no_feature"] += 1
        elif len(gids) == 1:
            cnts[next(iter(gids))] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid, total in cnts.items():
        print("%s\t%d" % (gid, total))
示例#2
0
def bam_count(args):
    """Count read pairs per gene and print a tab-separated summary.

    Paired alignments are read from the SAM file ``args.fi``. For every pair
    the feature sets found in ``exons`` under both mates are merged; the pair
    is then tallied under its single gene id, ``_no_feature`` (no overlap) or
    ``_ambiguous`` (more than one id). Pairs with a missing or unaligned mate
    are tallied under ``_unmapped``.

    Args:
        args: Object with an ``fi`` attribute naming the SAM input file.
    """
    bam = HTSeq.SAM_Reader(args.fi)
    # NOTE(review): `exons` is never assigned in this function. With the
    # loader call below disabled it has to exist at module level -- confirm.
    #exons = htseq_read_gtf(args.fg)
    cnts = collections.Counter()
    # The pairing helper yields (mate1, mate2) tuples and uses None for a
    # mate it could not find, so each item is unpacked directly.
    for aln1, aln2 in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # Both mates must be present and aligned before their intervals are
        # looked up; anything else is reported as unmapped.
        if aln1 is None or aln2 is None \
                or not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for _, val in exons[aln.iv].steps():
                gids |= val
        if not gids:
            cnts["_no_feature"] += 1
        elif len(gids) == 1:
            cnts[next(iter(gids))] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid, total in cnts.items():
        print("%s\t%d" % (gid, total))
示例#3
0
def Get_label_information(label, annot, bam_reader):
    """Collect annotation, coverage and per-feature read counts for a label.

    Args:
        label: Key into ``annot`` selecting the records to process.
        annot: Mapping from label to a sequence of 9-field records
            ``(feature, rank, chrom, start, end, strand, length,
            exon_rank_left, exon_rank_right)``.
        bam_reader: Object whose ``fetch`` method yields alignments, both
            for a ``region=`` string and when called without arguments.

    Returns:
        ``(gas, ga, gene_count)``: the ``GenomicArrayOfSets`` holding the
        ``(feature, rank)`` annotation, the ``GenomicArray`` incremented once
        per aligned block, and a dict mapping ``(feature, rank)`` to the
        number of reads assigned to it. If the reader yields no record at
        all, the arrays are returned with every count still at zero.
    """
    warnings.simplefilter("ignore")
    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    ga = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
    gene_count = {}
    records = annot[label]
    for (feature, rank, chrom, start, end, strand,
         _length, _exon_rank_left, _exon_rank_right) in records:
        iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        gas[iv] += (feature, rank)
        gene_count[(feature, rank)] = 0
    boundary_left = min(rec[3] for rec in records)
    boundary_right = max(rec[4] for rec in records)
    # Fetch the annotated span padded by 500 positions on either side.
    region_fetch = "%s:%d-%d" % (records[0][2], int(boundary_left) - 500,
                                 int(boundary_right) + 500)
    read_seq = bam_reader.fetch(region=region_fetch)
    # Peek at the first record of the whole file to learn whether the data
    # are paired-end; an empty file leaves nothing to count.
    one_read = next(iter(bam_reader.fetch()), None)
    if one_read is None:
        return gas, ga, gene_count
    pe_mode = one_read.paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    for a in read_seq:
        if not pe_mode:
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            iv_seq = (cigop.ref_iv for cigop in a.cigar
                      if cigop.type == "M" and cigop.size > 0)
        else:
            if ((a[0] and a[0].aQual < minaqual)
                    or (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and a[0].optional_field('NH') > 1)
                    or (a[1] and a[1].optional_field('NH') > 1)):
                continue
            if a[0] is not None and a[0].aligned:
                iv_seq = (cigop.ref_iv for cigop in a[0].cigar
                          if cigop.type in cigar_char and cigop.size > 0)
            else:
                iv_seq = tuple()
            if a[1] is not None and a[1].aligned:
                # The second mate's blocks are strand-inverted before lookup.
                iv_seq = itertools.chain(
                    iv_seq, (invert_strand(cigop.ref_iv)
                             for cigop in a[1].cigar
                             if cigop.type in cigar_char and cigop.size > 0))
        feature_aligned = set()
        for iv in iv_seq:
            for _, val2 in gas[iv].steps():
                feature_aligned |= val2
            # Coverage grows by one per aligned block. Doing this inside the
            # steps() loop would multiply it by the number of annotation
            # steps the block happens to span.
            ga[iv] += 1
        if not feature_aligned:
            continue
        # When the read touches any intron, only the intron features are
        # credited; otherwise every overlapped feature is.
        introns = [item for item in feature_aligned if item[0] == 'intron']
        for f in (introns or feature_aligned):
            gene_count[f] += 1
    return gas, ga, gene_count
示例#4
0
def _calc_insert_size(regions_path: str, bam_paths: List[str], out_path: str,
                      mc: MessageCenter):
    """Write the insert sizes of read pairs contained in each region.

    Every line of ``regions_path`` must start with tab-separated
    ``chrom``, ``start`` and ``end`` fields. For each region one line is
    written to ``out_path`` holding the region, its length and the
    ``;``-joined absolute insert sizes gathered from all BAM files.

    Raises:
        PEUtilPathError: if the regions file or any BAM file does not exist.
        PEUtilParseError: if a region line cannot be parsed.
    """
    mc.log_debug('regions_path: {}'.format(regions_path))
    mc.log_debug('bam_paths: {}'.format(', '.join(bam_paths)))
    mc.log_debug('out_path: {}'.format(out_path))

    mc.handle_progress('Collecting insert sizes...')

    if not os.path.exists(regions_path):
        raise PEUtilPathError(regions_path, 'File not exists.')

    readers = []
    for path in bam_paths:
        if not os.path.exists(path):
            raise PEUtilPathError(path, 'File not exists.')
        readers.append(HTSeq.BAM_Reader(path))

    with open(regions_path) as regions, open(out_path, 'w') as sink:
        for index, line in enumerate(regions):
            # Progress is reported every 1000 regions, never for the first.
            if index != 0 and index % 1000 == 0:
                mc.handle_progress('{} regions processed...'.format(index))
            fields = line.strip('\n').split('\t')
            try:
                chrom, start_text, end_text = fields[:3]
                start = int(start_text)
                end = int(end_text)
            except ValueError:
                raise PEUtilParseError(regions_path, 'Incorrect file format.')

            sizes = []
            for reader in readers:
                pairs = HTSeq.pair_SAM_alignments_with_buffer(
                    reader.fetch(chrom, start, end))
                for first, second in pairs:
                    # Pairs with a missing mate cannot give an insert size.
                    if first is None or second is None:
                        continue
                    for mate in (first, second):
                        assert mate.aligned
                        assert mate.iv.start < mate.iv.end
                        # Reject flagged alignments and mates that reach
                        # outside the region.
                        if (mate.not_primary_alignment
                                or mate.pcr_or_optical_duplicate
                                or mate.failed_platform_qc
                                or mate.iv.start < start
                                or end < mate.iv.end):
                            break
                    else:
                        # Reached only when neither mate was rejected.
                        sizes.append(
                            str(np.abs(first.inferred_insert_size)))
            sink.write('{0}:{1}-{2}({3})\t{4}\n'.format(
                chrom, start, end, end - start, ';'.join(sizes)))
示例#5
0
def Get_Skipstart_dict(region_fetch, all_bamfiles, strand):
    """Count the edges of skipped ("N") CIGAR regions across BAM files.

    For ``strand == "+"`` the start of every skipped region is recorded,
    for any other value its end. Single-end reads must be aligned and have
    ``NH`` <= 1; read pairs must pass the ``minaqual`` and ``NH`` filters.

    Args:
        region_fetch: Region string passed to each reader's ``fetch``.
        all_bamfiles: Iterable of BAM file paths.
        strand: Strand selector, compared against ``"+"``.

    Returns:
        A dict mapping each recorded position to the number of times it
        was seen.
    """
    def junction_edges(aln):
        # Positions of the non-empty "N" operations of one alignment.
        if strand == "+":
            return [int(op.ref_iv.start) for op in aln.cigar
                    if op.type == "N" and op.size > 0]
        return [int(op.ref_iv.end) for op in aln.cigar
                if op.type == "N" and op.size > 0]

    edges = []
    for bamfile in all_bamfiles:
        reader = HTSeq.BAM_Reader(bamfile)
        records = reader.fetch(region=region_fetch)
        # The first record of the whole file decides the pairing mode.
        paired = next(iter(reader.fetch())).paired_end
        if paired:
            records = HTSeq.pair_SAM_alignments_with_buffer(records)
        for rec in records:
            if not paired:
                if not rec.aligned:
                    continue
                if rec.optional_field('NH') > 1:
                    continue
                edges.extend(junction_edges(rec))
                continue

            mate1, mate2 = rec[0], rec[1]
            if ((mate1 and mate1.aQual < minaqual)
                    or (mate2 and mate2.aQual < minaqual)):
                continue
            if ((mate1 and mate1.optional_field('NH') > 1)
                    or (mate2 and mate2.optional_field('NH') > 1)):
                continue
            for mate in (mate1, mate2):
                if mate is not None and mate.aligned:
                    edges.extend(junction_edges(mate))
    return dict(collections.Counter(edges))
示例#6
0
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Pair up alignments and hand every usable pair to ``counter``.

    For each pair two interval sequences are built from the non-empty "M"
    CIGAR operations: the forward one keeps mate 1 and strand-inverts
    mate 2, the reverse one does the opposite. Pairs are reported through
    ``counter.not_aligned``, ``counter.non_unique`` (``NH`` > 1) or
    ``counter.too_low_quality`` (``aQual`` below ``minaqual``) when they
    are filtered, and through ``counter.forward_count`` and
    ``counter.reverse_count`` otherwise.

    Args:
        read_seq: Iterable of paired-end alignments.
        counter: Object providing the callbacks named above.
        order: ``"name"`` or ``"pos"``; selects the pairing routine.
        quiet: When true, no progress is written to stderr.
        minaqual: Minimum alignment quality a mate must have.

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            msg = "%d SAM alignment record pairs processed.\n" % (i)
            sys.stderr.write(msg)

        i += 1
        if r[0] is not None and r[0].aligned:
            forward_iv_seq = (co.ref_iv for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
            reverse_iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
        else:
            forward_iv_seq = tuple()
            reverse_iv_seq = tuple()
        if r[1] is not None and r[1].aligned:
            rest = (invert_strand(co.ref_iv) for co in r[1].cigar
                    if co.type == "M" and co.size > 0)
            forward_iv_seq = itertools.chain(forward_iv_seq, rest)
            rest = (co.ref_iv for co in r[1].cigar
                    if co.type == "M" and co.size > 0)
            reverse_iv_seq = itertools.chain(reverse_iv_seq, rest)
        else:
            # Neither mate is aligned.
            if (r[0] is None) or not (r[0].aligned):
                counter.not_aligned(r)
                continue
        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.non_unique(r)
                continue
        except KeyError:
            # A missing NH tag is treated as "not flagged as multi-mapped".
            pass
        if (r[0] and r[0].aQual < minaqual) or \
                (r[1] and r[1].aQual < minaqual):
            # Report through the `counter` argument; this function has no
            # other counter object in scope.
            counter.too_low_quality(r)
            continue

        counter.forward_count(forward_iv_seq, r)
        counter.reverse_count(reverse_iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (i))
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Feed paired alignments to ``counter`` in both strand orientations.

    Filtered pairs go to ``counter.not_aligned``, ``counter.non_unique`` or
    ``counter.too_low_quality``; every other pair is passed to
    ``counter.forward_count`` (mate 1 as is, mate 2 strand-inverted) and
    ``counter.reverse_count`` (mate 1 strand-inverted, mate 2 as is).

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order not in ("name", "pos"):
        raise ValueError("Illegal order specified.")
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    else:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    def matched_blocks(aln, flip):
        # Lazy sequence over the non-empty "M" blocks of one alignment,
        # strand-inverted when `flip` is set.
        if flip:
            return (invert_strand(op.ref_iv) for op in aln.cigar
                    if op.type == "M" and op.size > 0)
        return (op.ref_iv for op in aln.cigar
                if op.type == "M" and op.size > 0)

    pairs_seen = 0
    for pair in read_seq:
        if pairs_seen > 0 and pairs_seen % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (pairs_seen))
        pairs_seen += 1

        first, second = pair[0], pair[1]
        first_ok = first is not None and first.aligned
        second_ok = second is not None and second.aligned
        if not first_ok and not second_ok:
            counter.not_aligned(pair)
            continue

        if first_ok:
            forward_iv_seq = matched_blocks(first, False)
            reverse_iv_seq = matched_blocks(first, True)
        else:
            forward_iv_seq = tuple()
            reverse_iv_seq = tuple()
        if second_ok:
            forward_iv_seq = itertools.chain(forward_iv_seq,
                                             matched_blocks(second, True))
            reverse_iv_seq = itertools.chain(reverse_iv_seq,
                                             matched_blocks(second, False))

        try:
            if any(mate is not None and mate.optional_field("NH") > 1
                   for mate in (first, second)):
                counter.non_unique(pair)
                continue
        except KeyError:
            # No NH tag: the pair is not treated as multi-mapped.
            pass

        if any(mate and mate.aQual < minaqual for mate in (first, second)):
            counter.too_low_quality(pair)
            continue

        counter.forward_count(forward_iv_seq, pair)
        counter.reverse_count(reverse_iv_seq, pair)

    if not quiet:
        sys.stderr.write(
            "%d SAM alignment pairs processed.\n" % (pairs_seen))
def count_reads_paired(read_seq, counter, order, stranded,
                       quiet, minaqual, write_to_samout):
    """Pair up alignments and count each usable pair with ``counter``.

    The interval sequence of a pair is built from the non-empty "M" CIGAR
    operations of both mates. Unless ``stranded`` is ``"reverse"`` mate 1
    is used as is and mate 2 is strand-inverted; with ``"reverse"`` it is
    the other way round. Filtered pairs increment ``counter.notaligned``,
    ``counter.nonunique`` or ``counter.lowqual`` and are passed to
    ``write_to_samout`` with the matching ``__...`` label; all other pairs
    go to ``counter.count``.

    Args:
        read_seq: Iterable of paired-end alignments.
        counter: Object with ``notaligned``, ``nonunique`` and ``lowqual``
            attributes and a ``count(iv_seq, pair)`` method.
        order: ``"name"`` or ``"pos"``; selects the pairing routine.
        stranded: Strandedness setting; only ``"reverse"`` is special.
        quiet: When true, no progress is written to stderr.
        minaqual: Minimum alignment quality a mate must have.
        write_to_samout: Callable ``(pair, label)`` recording the outcome.

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("Illegal order specified.")

    def matched_blocks(aln, flip):
        # Non-empty "M" blocks of one alignment, strand-inverted on `flip`.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    flip_first = stranded == "reverse"

    i = 0
    for r in read_seq:
        # This function only ever handles pairs, so the messages say so
        # directly instead of consulting a pairing flag.
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % i)

        i += 1
        first_ok = r[0] is not None and r[0].aligned
        second_ok = r[1] is not None and r[1].aligned
        if not first_ok and not second_ok:
            write_to_samout(r, "__not_aligned")
            counter.notaligned += 1
            continue

        iv_seq = matched_blocks(r[0], flip_first) if first_ok else tuple()
        if second_ok:
            iv_seq = itertools.chain(
                iv_seq, matched_blocks(r[1], not flip_first))

        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # A missing NH tag is treated as "not flagged as multi-mapped".
            pass
        if (r[0] and r[0].aQual < minaqual) or \
                (r[1] and r[1].aQual < minaqual):
            # The tally lives on the counter, like the other two.
            counter.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue

        counter.count(iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
示例#9
0
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count reads per feature for the forward and/or reverse orientation.

    Reads (or read pairs) are filtered for alignment, uniqueness (``NH``)
    and ``minaqual``. The remaining ones are assigned to features per
    ``overlap_mode`` once for every requested orientation. Per orientation
    a count file is created through ``brenninc_utils.create_new_file`` and
    a summary is printed.

    Args:
        sam_filename: Input path, or ``"-"`` to read from stdin.
        features: Feature array indexed by interval; must expose
            ``chrom_vectors``.
        counts: Mapping of feature name to initial count; copied per
            orientation.
        samtype: ``"sam"``, ``"bam"`` or ``None`` to auto-detect.
        order: ``"name"`` or ``"pos"``; used for paired-end data only.
        forward: Count with reads in their given orientation.
        reverse: Count with reads strand-inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: When true, no progress is written to stderr.
        minaqual: Minimum alignment quality.
        samout: Path for the annotated SAM output, ``""`` to disable.
        directory: Output directory handed to ``create_new_file``.

    Raises:
        ValueError: for an unknown ``samtype`` or an illegal ``order``.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF:Z tag to every read of the record.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def matched_blocks(aln, flip):
        # Non-empty "M" blocks of one alignment, strand-inverted on `flip`.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    def record_ivs(r, flip_first):
        # Interval sequence of a read or pair. In a pair the second mate
        # always gets the opposite treatment of the first.
        if not pe_mode:
            return matched_blocks(r, flip_first)
        iv_seq = tuple()
        if r[0] is not None and r[0].aligned:
            iv_seq = matched_blocks(r[0], flip_first)
        if r[1] is not None and r[1].aligned:
            iv_seq = itertools.chain(
                iv_seq, matched_blocks(r[1], not flip_first))
        return iv_seq

    def overlapping_features(iv_seq):
        # Feature set for one interval sequence under `overlap_mode`.
        # Returns None when no step qualified in an intersection mode.
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    if len(fs2) > 0 or \
                            overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    if samtype is None:
        samtype = detect_sam_type(sam_filename)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    # Orientations in processing order, each with its own tallies. Keeping
    # them in one structure gives both orientations the same code path.
    directions = [name for name, wanted in
                  (("forward", forward), ("reverse", reverse)) if wanted]
    tallies = {}
    for name in directions:
        tallies[name] = {"empty": 0, "ambiguous": 0,
                         "counts": copy.copy(counts)}
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0

    # Opened only after the format check so a rejected call leaves no file.
    samoutfile = open(samout, "w") if samout != "" else None
    try:
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so the peeked record is put back.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            # Always re-raised: this only adds context on stderr.
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                else:
                    if ((r[0] is None or not r[0].aligned) and
                            (r[1] is None or not r[1].aligned)):
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1)
                                or (r[1] is not None and
                                    r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    # All feature sets are resolved before any tally is
                    # touched, so an unknown chromosome is counted once.
                    assigned = [
                        (name, overlapping_features(
                            record_ivs(r, name == "reverse")))
                        for name in directions]
                    for name, fs in assigned:
                        tally = tallies[name]
                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature")
                            tally["empty"] += 1
                        elif len(fs) > 1:
                            write_to_samout(r, "__ambiguous[" +
                                            '+'.join(fs) + "]")
                            tally["ambiguous"] += 1
                        else:
                            feature_name = list(fs)[0]
                            write_to_samout(r, feature_name)
                            tally["counts"][feature_name] += 1
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    # Only the orientations that are being counted.
                    for name in directions:
                        tallies[name]["empty"] += 1

        except:
            # Always re-raised: this only adds the input position.
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise
    finally:
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" %
                         (i, "alignments "
                          if not pe_mode else "alignment pairs"))

    # Each print call takes one pre-formatted string so the statement means
    # the same under Python 2 and Python 3.
    for name in directions:
        tally = tallies[name]
        title = name.capitalize()
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_%s_count" % name,
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        used_features_count = 0
        used_features_sum = 0
        print("%s written to %s" % (title, output))
        with open(output, "w") as output_file:
            for fn in sorted(tally["counts"].keys()):
                output_file.write("%s\t%d\n" % (fn, tally["counts"][fn]))
                used_features_count += 1
                used_features_sum += tally["counts"][fn]
            output_file.write("__no_feature\t%d\n" % tally["empty"])
            output_file.write("__ambiguous\t%d\n" % tally["ambiguous"])
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        print("%s features with alignment\t%d" %
              (title, used_features_count))
        print("%s alignments asigned to feature\t%d" %
              (title, used_features_sum))
        print("__%s_no_feature\t%d" % (name, tally["empty"]))
        print("__%s_ambiguous\t%d" % (name, tally["ambiguous"]))
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
示例#10
0
def count_reads_paired(read_seq, forward_counter, reverse_counter, order,
                       quiet, minaqual, write_to_samout):
    """Pair up alignments and count each usable pair per orientation.

    Either counter may be ``None`` to skip that orientation. The forward
    sequence keeps mate 1 and strand-inverts mate 2; the reverse sequence
    does the opposite. Only non-empty "M" CIGAR operations contribute.
    Filtered pairs increment ``notaligned``, ``nonunique`` or ``lowqual``
    on every active counter and are passed to ``write_to_samout`` with the
    matching ``__...`` label.

    Args:
        read_seq: Iterable of paired-end alignments.
        forward_counter: Counter for the forward orientation, or ``None``.
        reverse_counter: Counter for the reverse orientation, or ``None``.
        order: ``"name"`` or ``"pos"``; selects the pairing routine.
        quiet: When true, no progress is written to stderr.
        minaqual: Minimum alignment quality a mate must have.
        write_to_samout: Callable ``(pair, label)`` recording the outcome.

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("Illegal order specified.")

    def matched_blocks(aln, flip):
        # Non-empty "M" blocks of one alignment, strand-inverted on `flip`.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    # Active counters, forward first, for the shared filter tallies.
    active = [c for c in (forward_counter, reverse_counter)
              if c is not None]

    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % i)

        i += 1
        first_ok = r[0] is not None and r[0].aligned
        second_ok = r[1] is not None and r[1].aligned
        if not first_ok and not second_ok:
            write_to_samout(r, "__not_aligned")
            for c in active:
                c.notaligned += 1
            continue

        forward_iv_seq = tuple()
        reverse_iv_seq = tuple()
        if first_ok:
            forward_iv_seq = matched_blocks(r[0], False)
            reverse_iv_seq = matched_blocks(r[0], True)
        if second_ok:
            forward_iv_seq = itertools.chain(
                forward_iv_seq, matched_blocks(r[1], True))
            reverse_iv_seq = itertools.chain(
                reverse_iv_seq, matched_blocks(r[1], False))

        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                for c in active:
                    c.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # A missing NH tag is treated as "not flagged as multi-mapped".
            pass
        if (r[0] and r[0].aQual < minaqual) or \
                (r[1] and r[1].aQual < minaqual):
            for c in active:
                c.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue

        if forward_counter is not None:
            forward_counter.count(forward_iv_seq, r)
        if reverse_counter is not None:
            reverse_counter.count(reverse_iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Assign every read (or read pair) to a feature and print the tallies.

    Each record taken from ``read_seq`` lands in exactly one bucket: a single
    feature id (``counts[id]`` is incremented) or one of the special
    categories ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique``.  The chosen label is
    also passed to ``write_to_samout(record, label)``.  When all records are
    consumed the per-feature counts (sorted by id) and the five special
    counters are printed to stdout, one ``name<TAB>count`` line each.

    Args:
        features: object indexable by a genomic interval; the result's
            ``steps()`` yields ``(interval, set_of_feature_ids)`` pairs.  It
            must also expose ``chrom_vectors`` for the chromosome check.
        counts: mapping feature id -> int, updated in place with ``+= 1``.
        pe_mode: true if ``read_seq`` holds paired-end alignments that must
            be paired up first.
        read_seq: iterable of alignment records.
        order: ``"name"`` or ``"pos"``; selects the HTSeq pairing routine
            (only consulted in paired-end mode).
        stranded: ``"reverse"`` flips the strand of the first mate / single
            read; any other value flips the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: suppress progress messages on stderr.
        minaqual: records with ``aQual`` below this are ``__too_low_aQual``.
        write_to_samout: callable ``(record, label)``.

    Raises:
        ValueError: paired-end mode with an unknown ``order``.
        SystemExit: unknown ``overlap_mode`` (raised once a record reaches
            the overlap step).
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError("Illegal order specified.")

    reverse = stranded == "reverse"

    def matched_intervals(aln, flip):
        # Reference intervals of the non-empty "M" CIGAR operations of one
        # alignment, strand-inverted when `flip` is set.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    def overlapping_features(iv_seq):
        # Combine the feature sets under the intervals per `overlap_mode`.
        # Returns a set, or None when an intersection mode saw no usable step.
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs.update(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            strict = overlap_mode == "intersection-strict"
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    # "nonempty" skips feature-free stretches; "strict"
                    # lets them empty the intersection.
                    if strict or len(fs2) > 0:
                        fs = fs2.copy() if fs is None else fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n"
                             % (i, "s" if not pe_mode else " pairs"))
        i += 1

        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            # Only the tag lookup is guarded: a missing NH tag means the
            # multiplicity is unknown and the read is kept.
            try:
                multimapped = r.optional_field("NH") > 1
            except KeyError:
                multimapped = False
            if multimapped:
                nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = matched_intervals(r, reverse)
        else:
            first_ok = r[0] is not None and r[0].aligned
            second_ok = r[1] is not None and r[1].aligned
            if not first_ok and not second_ok:
                write_to_samout(r, "__not_aligned")
                notaligned += 1
                continue
            iv_seq = matched_intervals(r[0], reverse) if first_ok else tuple()
            if second_ok:
                # The second mate is flipped in exactly the opposite cases.
                iv_seq = itertools.chain(
                    iv_seq, matched_intervals(r[1], not reverse))
            # As before, a KeyError from either mate skips the whole check.
            try:
                multimapped = (
                    (r[0] is not None and r[0].optional_field("NH") > 1) or
                    (r[1] is not None and r[1].optional_field("NH") > 1))
            except KeyError:
                multimapped = False
            if multimapped:
                nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
            if (r[0] and r[0].aQual < minaqual) or \
                    (r[1] and r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            fs = overlapping_features(iv_seq)
        except UnknownChrom:
            # A chromosome absent from the annotation counts as "no feature".
            fs = None

        if not fs:
            write_to_samout(r, "__no_feature")
            empty += 1
        elif len(fs) > 1:
            write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
            ambiguous += 1
        else:
            feature_id = list(fs)[0]
            write_to_samout(r, feature_id)
            counts[feature_id] += 1

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n"
                         % (i, "alignments " if not pe_mode else "alignment pairs"))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
示例#12
0
def mapping_reads2shared_exons_introns(refGene_txt, bam_filename, minaqual,
                                       stranded, order, max_buffer_size):
    """Count reads per annotated exon/intron and print one row per feature.

    ``refGene_txt`` is a tab-separated file with five columns per line: gene
    label, feature, rank, position (``chrom:start-end:strand``) and length.
    For every gene, reads are fetched from the BAM file in a window reaching
    500 positions beyond the gene's outermost feature coordinates, and each
    read (or read pair) is assigned to the features its matched CIGAR
    segments overlap.  If any overlapped feature is an intron, only the
    introns are credited; otherwise every overlapped feature is credited.

    A header plus one line per feature (gene, feature, rank, position,
    length, read count, count per 1000 length units, percent of positions
    covered) goes to stdout; the global counters go to stderr at the end.

    Args:
        refGene_txt: path of the annotation table described above.
        bam_filename: path of the BAM file to read.
        minaqual: alignments with ``aQual`` below this are skipped and
            counted as ``_lowaqual``.
        stranded: ``'yes'`` or ``'reverse'`` make the genomic arrays
            strand-aware; ``'reverse'`` additionally swaps which mate is
            strand-inverted.
        order: ``'name'`` or ``'pos'``; selects the HTSeq pairing routine
            (paired-end data only).
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer``.

    Raises:
        ValueError: unknown ``order`` for paired-end data, or an alignment
            interval whose strand is neither ``+`` nor ``-`` when it has to
            be inverted.
    """
    # Global counters, reported on stderr at the end.
    counts = {}
    counts['_empty'] = 0
    counts['_ambiguous'] = 0
    counts['_lowaqual'] = 0
    counts['_notaligned'] = 0
    counts['_ambiguous_readpair_position'] = 0

    bam_reader = HTSeq.BAM_Reader(bam_filename)
    # CIGAR operations that consume reference and read: alignment match,
    # sequence match and sequence mismatch.
    cigar_char = ('M', '=', 'X')
    # Strand handling follows htseq-count.
    stranded_boolean = stranded == 'yes' or stranded == 'reverse'
    reverse_boolean = stranded == 'reverse'

    def invert_strand(iv):
        # Copy of `iv` on the opposite strand.
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    def matched_intervals(aln, flip):
        # Reference intervals of the non-empty match-like CIGAR operations.
        # The same operation set is used for every branch; the single-end
        # forward case previously accepted "M" only.
        if flip:
            return (invert_strand(cigop.ref_iv) for cigop in aln.cigar
                    if cigop.type in cigar_char and cigop.size > 0)
        return (cigop.ref_iv for cigop in aln.cigar
                if cigop.type in cigar_char and cigop.size > 0)

    def multimapped(aln):
        # True if the NH tag reports more than one hit.  A missing tag is
        # treated as "unknown, keep the read" instead of aborting the run.
        try:
            return aln.optional_field('NH') > 1
        except KeyError:
            return False

    sys.stdout.write(
        "Gene\tfeature\trank\tposition\tlength\tread_counts\tread_counts_norm\tcoverage(%)\n"
    )

    # gene label -> list of (feature, rank, chrom, start, end, strand, length)
    annot = collections.OrderedDict()
    with open(refGene_txt) as annotation_file:
        for line in annotation_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            gene_label, feature, rank, position, length = line.strip().split('\t')
            chrom, iv_str, strand = position.strip().split(':')
            start, end = map(int, iv_str.strip().split('-'))
            annot.setdefault(gene_label, []).append(
                (feature, int(rank), chrom, start, end, strand, int(length)))

    # Determined from the first BAM record the first time it is needed, so
    # the file is not re-opened for every gene.
    pe_mode = None

    for gene_name in annot:
        gene_features = annot[gene_name]
        gene_count = {}
        gas = HTSeq.GenomicArrayOfSets("auto", stranded=stranded_boolean)
        ga = HTSeq.GenomicArray("auto",
                                stranded=stranded_boolean,
                                typecode="i")

        # Annotation
        for feature, rank, chrom, start, end, strand, length in gene_features:
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            gas[iv] += (feature, rank)
            gene_count[(feature, rank)] = 0

        # Iterating bam_reader directly is problematic (reportedly a pysam
        # bug), so the reads are pulled through fetch() on a region instead.
        boundary_left = min(f[3] for f in gene_features)
        boundary_right = max(f[4] for f in gene_features)
        # A start below 1 would produce an unparsable region string for
        # genes close to the chromosome start, so clamp it.
        # NOTE(review): assumes 1-based region strings as used by samtools.
        region_fetch = "%s:%d-%d" % (gene_features[0][2],
                                     max(1, boundary_left - 500),
                                     boundary_right + 500)
        read_seq = bam_reader.fetch(region=region_fetch)

        # distinguish SE and PE mode
        if pe_mode is None:
            pe_mode = next(iter(bam_reader.fetch())).paired_end

        if pe_mode:
            if order == 'name':
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == 'pos':
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq, max_buffer_size=max_buffer_size)
            else:
                raise ValueError("Illegal order name.")

        # Mapping
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    counts['_notaligned'] += 1
                    continue
                if multimapped(a):
                    continue
                if a.aQual < minaqual:
                    counts['_lowaqual'] += 1
                    continue
                iv_seq = matched_intervals(a, reverse_boolean)
            else:
                if ((a[0] and a[0].aQual < minaqual)
                        or (a[1] and a[1].aQual < minaqual)):
                    counts['_lowaqual'] += 1
                    continue
                if ((a[0] and multimapped(a[0]))
                        or (a[1] and multimapped(a[1]))):
                    continue
                if a[0] is not None and a[0].aligned:
                    iv_seq = matched_intervals(a[0], reverse_boolean)
                else:
                    iv_seq = tuple()
                if a[1] is not None and a[1].aligned:
                    # The second mate is flipped in the opposite cases.
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(a[1], not reverse_boolean))

            feature_aligned = set()
            for iv in iv_seq:
                for iv2, val2 in gas[iv].steps():
                    feature_aligned |= val2
                # Once per interval is enough: coverage is only ever tested
                # for being greater than zero.
                ga[iv] += 1
            if not feature_aligned:
                counts['_empty'] += 1
                continue
            # If the read touches an intron, credit the introns only;
            # otherwise credit every overlapped feature.
            intronic = [item for item in feature_aligned
                        if item[0] == 'intron']
            for f in (intronic or feature_aligned):
                gene_count[f] += 1

        # Output, one row per annotated feature in file order.
        for feature, rank, chrom, start, end, strand, length in gene_features:
            feature_count = gene_count[(feature, rank)]
            # Explicit float division: plain "/" truncates on Python 2.
            feature_count_norm = float(feature_count) / length * 1000
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            cvg_region = list(ga[iv])
            # sum() over a generator works on Python 2 and 3, unlike
            # len(filter(...)), which fails on Python 3.
            covered = sum(1 for depth in cvg_region if depth > 0)
            cvg = float(covered) / len(cvg_region) * 100
            pos = "%s:%d-%d:%s" % (chrom, start, end, strand)
            sys.stdout.write('\t'.join(
                map(str, [
                    gene_name, feature, rank, pos, length, feature_count,
                    feature_count_norm, cvg
                ])) + '\n')

    for fn in counts:
        sys.stderr.write('%s\t%d\n' % (fn, counts[fn]))
示例#13
0
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Classify each read (or read pair) by feature and print the counts.

    Every record from ``read_seq`` is put into exactly one bucket: a single
    feature id (``counts[id] += 1``) or one of ``__no_feature``,
    ``__ambiguous``, ``__too_low_aQual``, ``__not_aligned`` and
    ``__alignment_not_unique``.  The label is also handed to
    ``write_to_samout(record, label)``.  At the end the per-feature counts
    (sorted by id) followed by the five special counters are printed to
    stdout as ``name<TAB>count`` lines.

    Args:
        features: object indexable by a genomic interval; ``steps()`` on the
            result yields ``(interval, set_of_feature_ids)`` pairs.  Its
            ``chrom_vectors`` attribute is used for the chromosome check.
        counts: mapping feature id -> int, incremented in place.
        pe_mode: true for paired-end input, which is paired up first.
        read_seq: iterable of alignment records.
        order: ``"name"`` or ``"pos"``; selects the HTSeq pairing routine
            (paired-end mode only).
        stranded: ``"reverse"`` inverts the strand of the first mate / single
            read; any other value inverts the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        quiet: suppress progress output on stderr.
        minaqual: records with ``aQual`` below this are ``__too_low_aQual``.
        write_to_samout: callable ``(record, label)``.

    Raises:
        ValueError: paired-end mode with an unknown ``order``.
        SystemExit: unknown ``overlap_mode`` (raised once a record reaches
            the overlap step).
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            # Call form of raise: accepted by both Python 2 and Python 3.
            raise ValueError("Illegal order specified.")

    reverse = stranded == "reverse"

    def matched_intervals(aln, flip):
        # Reference intervals of the non-empty "M" CIGAR operations of one
        # alignment; strand-inverted when `flip` is true.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    def overlapping_features(iv_seq):
        # Merge the feature sets found under the intervals according to
        # `overlap_mode`.  Returns a set, or None when an intersection mode
        # never saw a usable step.
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs.update(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            strict = overlap_mode == "intersection-strict"
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    # "nonempty" ignores feature-free stretches, "strict"
                    # lets them wipe out the intersection.
                    if strict or len(fs2) > 0:
                        fs = fs2.copy() if fs is None else fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n" %
                             (i, "s" if not pe_mode else " pairs"))
        i += 1

        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            # Guard only the tag lookup; without an NH tag the multiplicity
            # is unknown and the read is kept.
            try:
                multimapped = r.optional_field("NH") > 1
            except KeyError:
                multimapped = False
            if multimapped:
                nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = matched_intervals(r, reverse)
        else:
            first_ok = r[0] is not None and r[0].aligned
            second_ok = r[1] is not None and r[1].aligned
            if not first_ok and not second_ok:
                write_to_samout(r, "__not_aligned")
                notaligned += 1
                continue
            iv_seq = matched_intervals(r[0], reverse) if first_ok else tuple()
            if second_ok:
                # The second mate is inverted in exactly the opposite cases.
                iv_seq = itertools.chain(
                    iv_seq, matched_intervals(r[1], not reverse))
            # As before, a KeyError from either mate skips the whole check.
            try:
                multimapped = (
                    (r[0] is not None and r[0].optional_field("NH") > 1) or
                    (r[1] is not None and r[1].optional_field("NH") > 1))
            except KeyError:
                multimapped = False
            if multimapped:
                nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
            if (r[0] and r[0].aQual < minaqual) or \
                    (r[1] and r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            fs = overlapping_features(iv_seq)
        except UnknownChrom:
            # Chromosomes missing from the annotation count as "no feature".
            fs = None

        if not fs:
            write_to_samout(r, "__no_feature")
            empty += 1
        elif len(fs) > 1:
            write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
            ambiguous += 1
        else:
            feature_id = list(fs)[0]
            write_to_samout(r, feature_id)
            counts[feature_id] += 1

    if not quiet:
        sys.stderr.write(
            "%d SAM %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))

    # Single-argument print(...) gives the same output on Python 2 and 3.
    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
示例#14
0
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    def exists(obj, chain):
        _key = chain.pop(0)
        if _key in obj:
            return exists(obj[_key], chain) if chain else obj[_key]

    def check_overlapped_exons_and_calc_sum(gene):

        rightmost_value = gene["exons"][0][1]
        start = gene["exons"][0][0]
        new_exons = []
        total = rightmost_value - start
        for interval in gene["exons"]:

            if (interval[0] <= rightmost_value
                    and interval[1] >= rightmost_value):

                total += (interval[1] - rightmost_value)
                rightmost_value = interval[1]

            elif (interval[0] > rightmost_value):
                total += (interval[1] - interval[0])
                new_exons.append([start, rightmost_value
                                  ])  #add previous extended interval to result

                start = interval[0]
                rightmost_value = interval[1]

        new_exons.append([start, rightmost_value])

        gene["exons"] = new_exons
        gene["total_sum_of_exons"] = total

    def check_and_count_points_coverage(gene_id, first_read, second_read):

        # определить какую из точек пересекает
        # вычесть из каждой координаты координату начала гена!
        if (first_read is None or second_read is None):
            return

        gene_begin = genes_exons[gene_id]["gene_begin"]

        fstart = first_read.iv.start - gene_begin
        fend = first_read.iv.end - gene_begin
        sstart = second_read.iv.start - gene_begin
        send = second_read.iv.end - gene_begin

        if (first_read.proper_pair == False
                or second_read.proper_pair == False):
            return

        if (fend < sstart and fstart < fend and sstart < send):
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)

        elif (send < fstart and fstart < fend and sstart < send):
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)

        elif (fstart < fend and sstart < send and sstart >= fstart
              and send >= fend and sstart <= fend):
            check(gene_id, fstart, send)

        elif (fstart < fend and sstart < send and sstart <= fstart
              and send >= fstart and send <= fend):
            check(gene_id, sstart, fend)

        elif (fstart < sstart and send < fend):
            check(gene_id, fstart, fend)
        elif (sstart < fstart and fend < send):
            check(gene_id, sstart, send)

    def check(gene_id, start, end):
        total = 100
        half = total / 2
        left_interval = right_interval = half

        try:
            i = 0
            while (left_interval >= 10):
                if (i > 10):
                    raise ValueError('Out of boundaries\n')

                if (exists(
                        genes_coverage_in_points,
                    [gene_id, half
                     ]) == None):  # если точки нет то ищем ближаишую слева
                    # half = math.ceil(half)
                    half = int(math.floor(half / 10) * 10)
                    point = genes_coverage_in_points[gene_id][half]["point"]
                    right_interval += 5
                    left_interval -= 5

                else:  # если точка есть,
                    point = genes_coverage_in_points[gene_id][half]["point"]

                if (point < start):  # слева точка от рида, рид справой строны

                    half = half + (right_interval / 2)
                    left_interval = right_interval = right_interval / 2

                elif (point > end):  # точка справа от рида, рид слевой стороны

                    half = half - (left_interval / 2)
                    left_interval = right_interval = left_interval / 2

                elif (point > start and point < end):  # пересекает
                    genes_coverage_in_points[gene_id][half]["coverage"] += 1
                    return
                i += 1

        except:
            sys.stderr.write("Out of boundaries\n")

    def check2(gene_id, start, end):
        #gene_begin = genes_exons[gene_id]["gene_begin"]
        for i in range(0, 100, 10):

            point = genes_coverage_in_points[gene_id][i]["point"]

            if (start < point and point < end):
                genes_coverage_in_points[gene_id][i]["coverage"] += 1
                return

    def clear_all_cov_points():
        for gene_id, gene in genes_coverage_in_points.iteritems():

            for k, val in gene.iteritems():
                val["coverage"] = 0

    def plot_gene_coverage():
        sys.stderr.write("ENSG00000000003.10 genes on: " + str(test_n[0]) +
                         "\n")
        x = []
        y = []

        i = 0
        for k, val in enumerate(
                list(cvg[HTSeq.GenomicInterval("chrX", test_first_exon_start,
                                               test_last_exon_end)])):
            x.append(i)
            y.append(val)
            i += 1
        plt.plot(x, y)
        plt.show()
        """
         iv = HTSeq.GenomicInterval("chr3", 100, 200, "+")
        cvg[iv] += 1
        iv = HTSeq.GenomicInterval("chr3", 150, 250, "-")
        cvg[iv] += 1
        

        
        """

    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)

    #genes_coverage_in_points = {}
    genes_coverage_in_points = defaultdict(dict)
    #genes_exons = {}

    genes_exons = defaultdict(dict)
    #cvg = HTSeq.GenomicArray("auto", stranded != "no")

    test_n = [0]
    i = 0

    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))

                features[f.iv] += feature_id

                #counts[f.attr[id_attribute]] = 0

                #экзоны не в порядке сортировки! координат
                #ген - граница экзона
                #здесь будут все интервалы и сумма всех интервалов
                gene_id = feature_id  #f.attr[id_attribute]

                if (exists(genes_exons, [gene_id]) == None):
                    #координата первого экзона

                    genes_exons[gene_id] = {
                        "total_sum_of_exons": 0,
                        "total_aligned_reads": 0,
                        "gene_begin": 0,
                        "exons": list([[f.iv.start, f.iv.end]])
                    }

                else:

                    genes_exons[gene_id]["exons"].append(
                        [f.iv.start, f.iv.end])

                #10 точек для гена для которых будем считать покрытие(интроны вычтем)

            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)

    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(genes_exons) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    #проход по всем генам и внутри каждого сортируем по первой координате экзона
    #в конце сортировки каждого гена назначаем крайнюю координату начала гена(первый экзон)
    #пересекающиеся экзоны надо склеивать и расширять границы
    #после склеивания будем получать сумму экзонов total_sum_of_exons, т.е. мы получим участки непокрытые ни на одном стренде

    for gene_id, gene in genes_exons.iteritems():

        gene["exons"].sort()  #by first member
        gene["gene_begin"] = gene["exons"][0][0]

        #слить все пересекающиеся экзоны и одновременно посчитать сумму длин без полученных промежутков
        check_overlapped_exons_and_calc_sum(gene)

        total = gene["total_sum_of_exons"]  # длина всех экзонов

        for ten_interval in xrange(0, 100, 10):
            point = (total * ten_interval
                     ) / 100  #точка в абсолютном исчислении % от длины экзона
            prev_exon_end = 0

            for exon_key, exon in enumerate(gene["exons"]):

                #prev_exon_length + exon.start +
                point += (exon[0] - prev_exon_end)  #длина интрона

                if (point < exon[1]):  #точка конца экзона
                    #пишем точку в конечный массив
                    genes_coverage_in_points[gene_id][ten_interval] = {
                        "point": point - gene["gene_begin"],
                        "coverage": 0
                    }

                    break  # переход на следующую точку 10%
                else:
                    #длину экзона не уложившегося записываем
                    #prev_exon_length += exon.end - exon.start
                    prev_exon_end = exon[1]

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    sample = 0

    colors = ["red", "blue", "green", "yellow"]
    handlers = []
    sys.stderr.write(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
    for isam, (sam_filename) in enumerate(sam_filenames):

        total_of_reads_in_sample = 0

        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq, max_buffer_size=max_buffer_size)
                else:
                    raise ValueError("Illegal order specified.")

            notaligned = 0
            lowqual = 0

            i = 0
            for r in read_seq:
                #TODO 'NoneType' object has no attribute 'iv' raised in plot_coverage.py:169]
                total_of_reads_in_sample += 1
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))
                    sys.stderr.write(
                        strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        #notaligned += 1
                        #write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore')
                            and r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore')
                            and r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            #nonunique += 1
                            #write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        #write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if (co.type in com and co.size > 0))
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq, (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq, (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            #write_to_samout(r, "__not_aligned", samoutfile)
                            #notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            #nonunique += 1
                            #write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        #write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                continue
                                #raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                #counts[list(fs)[0]] += 1
                                #read mapped only for one exon, (all cigar parts of both reads in pair mapped on one gene, but may be for several exons)
                                #we can take this read into account of analysis
                                #they must come in sorted order by coordinate!
                                #this is one unit of analysis. save it in memory and go throught it

                                gene_name = list(fs)[0]  # - имя гена

                                genes_exons[gene_name][
                                    "total_aligned_reads"] += 1

                                #if (total_of_reads_in_sample==100000):
                                #   break

                                check_and_count_points_coverage(
                                    gene_name, r[0], r[1])
                            """
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    #counts[fsi] += 1 
                            """
                        else:
                            sys.exit("Illegal multimap mode.")

                except UnknownChrom:
                    #write_to_samout(r, "__no_feature", samoutfile)
                    #empty += 1
                    raise

        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))

        if samoutfile is not None:
            samoutfile.close()

        #сохранить данные в таблицы чтобы работать с ними как угодно потом!

        outfile = open(
            '/home/kirill/bi/transcript/' + str(sample) + '_dict.txt', 'w')
        outfile.write("total_of_reads_in_sample" + '\t' +
                      str(total_of_reads_in_sample) + '\n')
        for gene_id, gene in genes_coverage_in_points.iteritems():

            outfile.write(
                str(gene_id) + '\t' +
                str(genes_exons[gene_id]["total_aligned_reads"]) + '\t' +
                str(genes_exons[gene_id]["total_sum_of_exons"]) + '\n')

            outfile.write(str(gene_id) + '\t')
            [
                outfile.write(str(val["coverage"]) + '\t')
                for k, val in gene.iteritems()
            ]
            outfile.write('\n')

        outfile.close()

        #############test################

        #plot_gene_coverage()

        ################################

        #1. получить % от числа ридов картированных на ген в конкретной точке(сумма всех % на 10 точках = 100) - число ридов картированных на ген будем записывать в массив(это бывший массиыв count)
        #2 для каждой точки делим полученный процент на длину конкретного гена (total_sum of exons)
        #3. для каждой точки делим величину на общее число ридов в образце
        #4. deviance - min - max всех значений? точка на графике среднее между ними

        CalcCoverage.do_coverage(genes_coverage_in_points, genes_exons,
                                 total_of_reads_in_sample, colors, sample,
                                 handlers)

        sample += 1

        #обнуление точек покрытия
        clear_all_cov_points()

    plt.legend(handlers, ['Sample ' + str(v) for v in range(0, sample, 1)])
    plt.title('Positions relative coverege')
    plt.xlabel('5` -> 3` positions, %')
    plt.ylabel('relative coverage')
    plt.grid(True)

    plt.savefig('/home/kirill/bi/transcript/covarage.png')
    plt.show()
    plt.close()
示例#15
0
def count_reads_in_features( sam_filename, gff_filename, samtype, order, stranded,
      overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
   """Count alignments per annotated feature and print the table to stdout.

   The GFF file is read first: every record whose type equals
   ``feature_type`` is stored in a genomic array of sets under the value of
   its ``id_attribute`` attribute. The alignments are then streamed, and each
   read (or read pair) is assigned to the set of features overlapped by its
   CIGAR "M" operations. A read is counted only when that set contains
   exactly one feature.

   Parameters:
      sam_filename   -- path of the alignment file, or "-" to read from stdin.
      gff_filename   -- annotation file handed to ``HTSeq.GFF_Reader``.
      samtype        -- "sam" or "bam"; selects the HTSeq reader class.
      order          -- "name" or "pos"; only consulted for paired-end data,
                        where it selects the mate-pairing function.
      stranded       -- "no" builds an unstranded feature array; "reverse"
                        inverts the strand of the first mate (otherwise the
                        second mate is the one that is inverted).
      overlap_mode   -- "union", "intersection-strict" or
                        "intersection-nonempty".
      feature_type   -- GFF feature type to count; other records are ignored.
      id_attribute   -- GFF attribute used as the feature identifier.
      quiet          -- if true, suppress progress messages on stderr.
      minaqual       -- alignments with ``aQual`` below this value are skipped
                        and tallied as ``__too_low_aQual``.
      samout         -- file name for a copy of the input SAM lines annotated
                        with an ``XF:Z:`` tag; "" disables this output.

   Output: one ``feature<TAB>count`` line per feature (sorted by feature id)
   followed by the ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
   ``__not_aligned`` and ``__alignment_not_unique`` tallies. Returns None.

   Raises:
      ValueError -- a selected feature lacks ``id_attribute``, lacks strand
                    information while ``stranded`` is not "no", or
                    ``samtype`` / ``order`` has an unsupported value.
   Calls ``sys.exit`` when ``overlap_mode`` is not one of the supported modes.

   Relies on ``HTSeq``, ``invert_strand`` and ``UnknownChrom`` being available
   at module level.
   """

   def write_to_samout( r, assignment ):
      # Echo the original SAM line(s) of a read or read pair with the
      # assignment appended as an XF:Z tag; no-op when no samout was requested.
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() +
               "\tXF:Z:" + assignment + "\n" )


   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None

   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
   counts = {}

   # Try to open samfile to fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close()

   gff = HTSeq.GFF_Reader( gff_filename )
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               raise ValueError( "Feature %s does not contain a '%s' attribute" %
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               raise ValueError( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ feature_id ] = 0
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      # Report where in the GFF file the failure happened, then propagate.
      sys.stderr.write( "Error occured when processing GFF file (%s):\n" % gff.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )

   if len( counts ) == 0:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

   if samtype == "sam":
      SAM_or_BAM_Reader = HTSeq.SAM_Reader
   elif samtype == "bam":
      SAM_or_BAM_Reader = HTSeq.BAM_Reader
   else:
      raise ValueError( "Unknown input format %s specified." % samtype )

   # Peek at the first record to find out whether the data are paired-end.
   try:
      if sam_filename != "-":
         read_seq_file = SAM_or_BAM_Reader( sam_filename )
         read_seq = read_seq_file
         first_read = next( iter( read_seq ) )
      else:
         # stdin cannot be re-read, so the peeked record is chained back in
         # front of the remaining stream.
         read_seq_file = SAM_or_BAM_Reader( sys.stdin )
         read_seq_iter = iter( read_seq_file )
         first_read = next( read_seq_iter )
         read_seq = itertools.chain( [ first_read ], read_seq_iter )
      pe_mode = first_read.paired_end
   except:
      sys.stderr.write( "Error occured when reading beginning of SAM/BAM file.\n" )
      raise

   try:
      if pe_mode:
         if order == "name":
            read_seq = HTSeq.pair_SAM_alignments( read_seq )
         elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer( read_seq )
         else:
            raise ValueError( "Illegal order specified." )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      i = 0
      for r in read_seq:
         if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d SAM alignment record%s processed.\n" % ( i, "s" if not pe_mode else " pairs" ) )

         i += 1
         if not pe_mode:
            if not r.aligned:
               notaligned += 1
               write_to_samout( r, "__not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
                  nonunique += 1
                  write_to_samout( r, "__alignment_not_unique" )
                  continue
            except KeyError:
               # No NH tag on this record: it is not treated as multi-mapped.
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout( r, "__too_low_aQual" )
               continue
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 )
         else:
            # r is a (first mate, second mate) pair; either entry may be None.
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
               # The second mate gets the opposite strand treatment of the first.
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq,
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
               else:
                  iv_seq = itertools.chain( iv_seq,
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( r, "__not_aligned" )
                  notaligned += 1
                  continue
            try:
               if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                  nonunique += 1
                  write_to_samout( r, "__alignment_not_unique" )
                  continue
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout( r, "__too_low_aQual" )
               continue

         try:
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     fs = fs.union( fs2 )
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     # "nonempty" ignores feature-free stretches, "strict"
                     # lets them empty the intersection.
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( r, "__no_feature" )
               empty += 1
            elif len( fs ) > 1:
               write_to_samout( r, "__ambiguous[" + '+'.join( fs ) + "]" )
               ambiguous += 1
            else:
               write_to_samout( r, list(fs)[0] )
               counts[ list(fs)[0] ] += 1
         except UnknownChrom:
            # Alignment on a chromosome absent from the annotation.
            write_to_samout( r, "__no_feature" )
            empty += 1

   except:
      sys.stderr.write( "Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string() )
      raise
   finally:
      # Close the annotated SAM output whether or not processing succeeded.
      if samoutfile is not None:
         samoutfile.close()

   if not quiet:
      sys.stderr.write( "%d SAM %s processed.\n" % ( i, "alignments " if not pe_mode else "alignment pairs" ) )

   # Single-argument print(...) produces identical output on Python 2 and 3.
   for fn in sorted( counts ):
      print( "%s\t%d" % ( fn, counts[fn] ) )
   print( "__no_feature\t%d" % empty )
   print( "__ambiguous\t%d" % ambiguous )
   print( "__too_low_aQual\t%d" % lowqual )
   print( "__not_aligned\t%d" % notaligned )
   print( "__alignment_not_unique\t%d" % nonunique )
示例#16
0
def count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        cb_tag,
        ub_tag,
        ):
    '''Count UMIs per feature for every cell barcode in a BAM file.

    Each alignment (or alignment pair) is assigned to the features overlapped
    by its CIGAR match operations (M, = and X) and tallied under its cell
    barcode and UMI. Afterwards every (cell barcode, UMI) combination
    contributes a single count to its most frequent assignment; a tie between
    the two most frequent assignments contributes nothing.

    Args:
        sam_filename: path of the input, or "-" for stdin. The input is
            always opened with ``HTSeq.BAM_Reader``.
        features: feature index that supports ``features[iv].steps()`` and
            has a ``chrom_vectors`` attribute (presumably an
            ``HTSeq.GenomicArrayOfSets`` -- confirm against the caller).
        feature_attr, feature_type, id_attribute, additional_attributes:
            accepted but not used by this function.
        order: "name" or "pos"; chooses the mate-pairing function for
            paired-end data.
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when order is "pos".
        stranded: "reverse" inverts the strand of the first mate; any other
            value inverts the second mate instead.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        multimapped_mode: "none" skips alignments with NH > 1 and counts only
            unambiguous assignments; "all" counts every overlapped feature.
        secondary_alignment_mode: "ignore" skips secondary alignments.
        supplementary_alignment_mode: "ignore" skips supplementary alignments.
        quiet: if true, suppress progress messages on stderr.
        minaqual: alignments with ``aQual`` below this value are tallied as
            ``__too_low_aQual`` and skipped.
        samout_format: "sam"/"SAM" or "bam"/"BAM"; format of the annotated
            output.
        samout_filename: path of the annotated output, or None for none.
        cb_tag: optional-field tag holding the cell barcode.
        ub_tag: optional-field tag holding the UMI.

    Returns:
        dict with ``'cell_barcodes'`` (sorted list of cell barcodes; None, for
        reads without the tag, sorts first) and ``'counts'`` (cell barcode ->
        Counter of assignment -> number of UMIs). Assignments include the
        special ``__*`` categories.

    Raises:
        ValueError: ``order`` is neither "name" nor "pos" for paired-end data.
    Calls ``sys.exit`` on an unsupported overlap or multimap mode.
    '''

    def write_to_samout(r, assignment, samoutfile, template=None):
        # Tag each read with XF=<assignment> and write it to the output file.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    def identify_barcodes(r):
        '''Identify barcode from the read or pair (both must have the same)'''
        if not pe_mode:
            r = (r,)
        # cell, UMI
        barcodes = [None, None]
        for read in r:
            if read is None:
                continue
            for tag, val in read.optional_fields:
                if tag == cb_tag:
                    barcodes[0] = val
                elif tag == ub_tag:
                    barcodes[1] = val
                else:
                    continue
                # Stop only once BOTH barcodes are known. Counting tag hits
                # instead would stop after seeing the cell barcode on each
                # mate and drop a UMI that is present only on the second one.
                if barcodes[0] is not None and barcodes[1] is not None:
                    return barcodes
        return barcodes

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                    samout_filename, 'wb',
                    template=template,
                    )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs
        except:
            first_read = None
            pe_mode = False
        if first_read is not None:
            # Put the peeked record back in front of the stream.
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    try:
        if pe_mode:
            primary_only = ((supplementary_alignment_mode == 'ignore') and
                            (secondary_alignment_mode == 'ignore'))
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                        read_seq,
                        primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq,
                        max_buffer_size=max_buffer_size,
                        primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        # The nesting is cell barcode, UMI, feature
        counts = defaultdict(lambda: defaultdict(Counter))
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment record%s processed.\n" %
                    (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1

            cb, ub = identify_barcodes(r)

            if not pe_mode:
                if not r.aligned:
                    counts[cb][ub]['__not_aligned'] += 1
                    write_to_samout(
                            r, "__not_aligned", samoutfile,
                            template)
                    continue
                if ((secondary_alignment_mode == 'ignore') and
                   r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore') and
                   r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        write_to_samout(
                                r,
                                "__alignment_not_unique",
                                samoutfile,
                                template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: the alignment is not treated as multi-mapped.
                    pass
                if r.aQual < minaqual:
                    counts[cb][ub]['__too_low_aQual'] += 1
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type in com
                              and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv)
                              for co in r.cigar if (co.type in com and
                                                    co.size > 0))
            else:
                # r is a (first mate, second mate) pair; either may be None.
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate gets the opposite strand treatment.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(
                                r, "__not_aligned", samoutfile,
                                template)
                        counts[cb][ub]['__not_aligned'] += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                       (r[1] is not None and r[1].optional_field("NH") > 1)):
                        write_to_samout(
                                r, "__alignment_not_unique", samoutfile,
                                template)
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                   (r[1] and r[1].aQual < minaqual)):
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    counts[cb][ub]['__too_low_aQual'] += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # "nonempty" ignores feature-free stretches,
                            # "strict" lets them empty the intersection.
                            if ((len(fs2) > 0) or
                               (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(
                            r, "__no_feature", samoutfile,
                            template)
                    counts[cb][ub]['__no_feature'] += 1
                elif len(fs) > 1:
                    write_to_samout(
                            r, "__ambiguous[" + '+'.join(fs) + "]",
                            samoutfile,
                            template)
                    counts[cb][ub]['__ambiguous'] += 1
                else:
                    write_to_samout(
                            r, list(fs)[0], samoutfile,
                            template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[cb][ub][list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[cb][ub][fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                # Alignment on a chromosome absent from the feature index.
                write_to_samout(
                        r, "__no_feature", samoutfile,
                        template)
                counts[cb][ub]['__no_feature'] += 1

    except:
        sys.stderr.write(
            "Error occured when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise
    finally:
        # Close the annotated output whether or not processing succeeded.
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    # Get rid of UMI by majority rule
    # Reads without the cell-barcode tag are stored under the key None, which
    # cannot be ordered against str on Python 3; sort None first explicitly.
    cbs = sorted(
        counts,
        key=lambda bc: (bc is not None, bc if bc is not None else ""))
    counts_noumi = {}
    for cb in cbs:
        counts_cell = Counter()
        for ub, udic in counts.pop(cb).items():
            # In case of a tie, do not increment either feature
            top = udic.most_common(2)
            if (len(top) == 2) and (top[0][1] == top[1][1]):
                continue
            counts_cell[top[0][0]] += 1
        counts_noumi[cb] = counts_cell

    return {
        'cell_barcodes': cbs,
        'counts': counts_noumi,
        }
示例#17
0
def main():
    """Command-line entry point: estimate feature abundances from alignments.

    Workflow, in order:

    1. Parse command-line arguments and open one output file per requested
       abundance unit.
    2. Read the GFF3 file and store every feature of the requested type in
       an ``HTSeq.GenomicArrayOfSets`` keyed by the chosen ID attribute.
    3. Iterate over the SAM/BAM alignments (single reads or read pairs),
       discard unaligned, multiply aligned, low-quality and duplicate
       records, and assign the remainder to features according to the
       selected overlap mode.
    4. Normalize the per-feature counts for each requested unit, optionally
       aggregate them by a category taken from the relational database, and
       write them out as tab-separated ``name<TAB>abundance`` lines.
    5. Print run statistics to standard error.

    The function takes no arguments and returns nothing; it terminates the
    process through ``sys.exit(1)`` (or ``parser.error``) on fatal problems.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'alignment_file',
        metavar='in.aln',
        help="input alignment file in SAM or BAM format. Use '-' to indicate "
        "that input should be taken from standard input (stdin)")
    parser.add_argument(
        'feature_file',
        metavar='in.gff3',
        help="input feature annotation file in GFF3 format. Use '-' to indicate "
        "that input should be taken from standard input (stdin)")
    parser.add_argument(
        '-m',
        '--mapping',
        metavar='in.json',
        dest='map_files',
        action=ParseSeparator,
        sep=',',
        help="input one or more relational databases, in JSON format, "
        "containing features mapped to feature categories, such as genes "
        "to gene families or exons to genes. Abundance estimates for the "
        "given feature category will be reported in place of features. "
        "Multiple input files can be provided by separating them with a "
        "comma and no spaces")
    parser.add_argument(
        '-c',
        '--category',
        metavar='FIELD',
        dest='category',
        help="field in the relational database representing how features "
        "are categorized. WARNING: if the value type of the selected field "
        "is a list, then the category abundance totals can be greater than "
        "the feature abundance totals")
    gff_group = parser.add_argument_group('GFF3 arguments')
    gff_group.add_argument(
        '-t',
        '--type',
        metavar='TYPE',
        dest='ftype',
        default='CDS',
        help="feature type (3rd column in GFF file) to estimate abundance for "
        "[default: CDS]. All features of other type will be ignored")
    gff_group.add_argument(
        '-a',
        '--attr',
        metavar='ATTRIBUTE',
        default="Name",
        help="GFF attribute to use as the ID for the calculated abundances "
        "[default: 'Name']. This value will also be used as the search "
        "ID in the relational database, if provided")
    aln_group = parser.add_argument_group('SAM/BAM arguments')
    aln_group.add_argument(
        '-f',
        '--format',
        metavar='FORMAT',
        dest='aformat',
        choices=['bam', 'sam'],
        default='bam',
        help="input alignment file format [default: bam]. Options are 'sam' "
        "or 'bam'")
    aln_group.add_argument(
        '-q',
        '--qual',
        metavar='THRESH',
        dest='minqual',
        type=int,
        default=2,
        help="skip all reads with alignment quality lower than the threshold "
        "[default: 2]")
    aln_group.add_argument(
        '-s',
        '--sorting',
        metavar='ORDER',
        dest='order',
        choices=["position", "name"],
        default='position',
        help="alignment file sorting scheme. Options are 'position' and "
        "'name' [default: position]. Alignments must be pre-sorted "
        "either by position/coordinates or by read name. This option "
        "will be ignored for single-end reads")
    aln_group.add_argument(
        '-b',
        '--buffer',
        metavar='BYTES',
        dest='buffer_size',
        type=int,
        default=3145728,
        help="buffer size for paired reads in the alignment file if sorted by "
        "position [default: 3145728 (3GB)]. This value should be "
        "increased if memory issues are encountered")
    count_group = parser.add_argument_group('quantification arguments')
    count_group.add_argument(
        '-e',
        '--mode',
        metavar='MODE',
        choices=["union", "intersection-strict", "intersection-nonempty"],
        default="union",
        help="mode for handling different alignment scenarios. Options are "
        "'union', 'intersection-strict', and 'intersection-nonempty' "
        "[default: union]. The modes will count alignments differently "
        "depending on whether a read/pair overlaps more than one feature "
        "or only partially aligns to a single feature. The most "
        "inclusive mode is 'union' when given with the nonunique flag, "
        "and the least inclusive is 'intersection-strict'")
    count_group.add_argument(
        '-u',
        '--units',
        metavar='UNITS',
        dest='norm',
        action=ParseSeparator,
        sep=',',
        default='counts',
        help="comma-separated list of units to output abundance estimates in "
        "[default: counts]. Options are 'counts', 'fpk' (fragments per "
        "kilobase of feature), 'fpkm' (fragements per kilobase of "
        "feature per million mapped fragments), 'tpm' "
        "(transcripts/fragments per million), 'prop', and 'custom'. If "
        "other than 'counts', features will be normalized by recruitment "
        "length, which will be calculated from the start and end fields "
        "of the GFF3 file. This is the sole normalization method used "
        "when transforming counts to FPK, and is useful to correct for "
        "differences in feature lengths within a sample. In addition to "
        "feature length, FPKM and TPM attempt to account for differences "
        "between samples in sequencing effort. An advantage of TMP over "
        "FPKM is that TPM is a proportional measurement, making it "
        "easier to identify the extent that the relative 'importance' of "
        "a given feature changes between samples. A custom transformation "
        "can also be performed when used with the -k/--coeff argument, in "
        "which case the length normalized proportion of a feature will be "
        "multiplied by the provided scaling factor.")
    count_group.add_argument(
        '-k',
        '--coeff',
        metavar='MUL',
        dest='sfactor',
        type=float,
        default=1,
        help="multiplier to use when 'custom' is given to -u/--units "
        "[default: 1]")
    count_group.add_argument(
        '--cdna',
        dest='transcripts',
        action='store_true',
        help="sequences represent cDNA [default: False]. Whether sequences are "
        "from gDNA or cDNA will determine how the length of a feature is "
        "calculated for normalization. If cDNA, effective length will "
        "serve as feature length")
    count_group.add_argument(
        '--nonunique',
        action='store_true',
        help="allow reads to align with more than one feature")
    output_group = parser.add_argument_group('output control arguments')
    output_group.add_argument(
        '-o',
        '--outpref',
        type=str,
        metavar='PREFIX',
        dest='outpref',
        default='sample',
        help="prefix for the output tabular files containing feature abundance "
        "estimates [default: sample]. File names will be appended with "
        "the units, file format, and compression algorithm, if relevant "
        "[e.g. sample.counts.csv.gz]")
    output_group.add_argument(
        '--filter',
        dest='cat_only',
        action='store_true',
        help="only output abundances for features with an associated feature "
        "category [default: output all]")
    compression = output_group.add_mutually_exclusive_group()
    compression.add_argument('--gzip',
                             dest='gzipped',
                             action='store_true',
                             help="compress output using the gzip algorithm")
    compression.add_argument('--bzip2',
                             dest='bzipped',
                             action='store_true',
                             help="compress output using the bzip2 algorithm")
    compression.add_argument('--lzma',
                             dest='lzma',
                             action='store_true',
                             help="compress output using the lzma algorithm")
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()

    # Argument sanity checks
    if (args.category and not args.map_files) or \
        (args.map_files and not args.category):
        parser.error("error: -m/--mapping and -c/--category must be supplied "
                     "together")

    if args.alignment_file == '-' and args.feature_file == '-':
        parser.error("error: standard input (stdin) can only be redirected to "
                     "a single positional argument")

    # Output run information
    all_args = sys.argv[1:]
    print("{} {!s}".format('count_features', __version__), file=sys.stderr)
    print(textwrap.fill(
        "Command line parameters: {}".format(' '.join(all_args)), 79),
        file=sys.stderr)
    print("", file=sys.stderr)

    # Track program run-time
    start_time = time()

    # Assign variables based on user inputs
    if args.gzipped:
        compression = '.gz'
    elif args.bzipped:
        compression = '.bz2'
    elif args.lzma:
        compression = '.xz'
    else:
        compression = ''

    allowed_units = ["counts", "tpm", "custom", "prop", "fpk", "fpkm"]

    # argparse does not run a default value through the option's action, so
    # when -u/--units is omitted the value is the bare string 'counts'.
    # Iterating over that string would yield single characters, so wrap any
    # non-sequence value in a list before looping over the requested units.
    if isinstance(args.norm, (list, tuple)):
        requested_units = args.norm
    else:
        requested_units = [args.norm]

    out_handles = {}
    for unit in requested_units:
        if unit not in allowed_units:
            print("warning: unknown metric of abundance '{}' provided to "
                  "-u/--unit. Please see the help message for a list of the "
                  "allowed units".format(unit),
                  file=sys.stderr)
            continue

        outfile = "{}.{}.csv{}".format(args.outpref, unit, compression)
        try:
            out_h = open_io(outfile, mode='wb')
        except AttributeError:
            print("error: unable to write to '{}'".format(outfile),
                  file=sys.stderr)
            sys.exit(1)

        out_handles[unit] = out_h

    if not out_handles:
        print(
            "error: no output files can be created. Please re-run with one "
            "or more of the accepted units of abundance",
            file=sys.stderr)
        sys.exit(1)

    overlap_mode = args.mode
    minaqual = args.minqual
    feature_type = args.ftype
    id_field = args.attr
    category_field = args.category
    are_transcripts = args.transcripts
    category_only = args.cat_only
    multi_aln = args.nonunique

    # CIGAR operations that consume reference bases as (mis)matches
    match_types = ('M', '=', 'X')

    if args.aformat == "sam":
        align_reader = HTSeq.SAM_Reader
    else:  #must be BAM then
        align_reader = HTSeq.BAM_Reader

    if args.map_files:
        mapping = load_dbs(args.map_files, fields=[category_field], csv=False)
    else:
        mapping = None

    # Store features in genomic arrays
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    counts = {}

    # Iterate over GFF3 file, storing features to estimate coverage for
    no_attr = 0
    f_totals = 0
    ftype_totals = 0

    try:
        if args.feature_file == '-':
            gff = HTSeq.GFF_Reader(sys.stdin)
        else:
            gff = HTSeq.GFF_Reader(args.feature_file)

        for f in gff:
            f_totals += 1

            try:
                feature_id = f.attr[id_field]
            except KeyError:
                # Features lacking the ID attribute get a placeholder name;
                # these are filtered out again when results are written
                no_attr += 1
                feature_id = "unkwn_{:08}".format(no_attr)

            # Skip features of wrong type
            if feature_type:
                if f.type == feature_type:
                    ftype_totals += 1
                else:
                    continue

            # Store feature length for normalization. HTSeq intervals are
            # 0-based and half-open, so the length is simply end - start;
            # adding one would overstate every feature by a base.
            feature_length = abs(f.iv.end - f.iv.start)

            features[f.iv] += feature_id  #for mapping alignments
            counts[feature_id] = {'count': 0, 'length': feature_length}
    except Exception:
        # Only genuine errors are reported here; KeyboardInterrupt and
        # SystemExit are allowed to propagate untouched
        print("error: problem occured when processing GFF3 file at line {}".
              format(gff.get_line_number_string()),
              file=sys.stderr)
        sys.exit(1)

    # Verify GFF3 file contains features of the specified type
    if ftype_totals == 0:
        print("error: no features of type '{}' found.\n".format(feature_type),
              file=sys.stderr)
        sys.exit(1)

    if no_attr > 0:
        print("warning: found {!s} features without a '{}' attribute.\n"
              .format(no_attr, id_field), file=sys.stderr)

    # Check alignment file formatting
    try:
        if args.alignment_file == '-':
            # stdin cannot be rewound, so re-attach the peeked first record
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq_file = align_reader(args.alignment_file)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
    except Exception:
        print(
            "error: unable to read the alignment file. Please verify that "
            "the formatting is correct.",
            file=sys.stderr)
        sys.exit(1)

    pe_mode = first_read.paired_end  #reads are paired-end or single-end
    if pe_mode:
        if args.order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        else:  #order is by position
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                read_seq, max_buffer_size=args.buffer_size)

    # Iterate over alignment file
    empty = 0  #reads aligned somewhere in the assembly, but not to a feature
    duplicate = 0  #reads are duplicates of other reads
    ambiguous = 0  #reads overlapping more than one feature
    notaligned = 0  #unaligned reads
    lowqual = 0  #reads not passing minimum threshold for alignment quality
    nonunique = 0  #reads having multiple alignments with similar score
    r_totals = 0  #total reads
    aln_totals = 0  #correctly mapped to a feature
    fld = []  #fragment length / insert-size distribution
    for r in read_seq:

        r_totals += 1

        if not pe_mode:  #single-end read mapping

            # Check if read aligned
            if not r.aligned:
                notaligned += 1
                continue

            # Check if the read aligned uniquely
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(r.read.name),
                          file=sys.stderr)
                    continue
            except KeyError:
                # No NH tag present: treat the alignment as unique
                pass

            # Check if the alignment passed the quality requirement
            if r.aQual < minaqual:
                lowqual += 1
                continue

            # Check whether the read was marked as a duplicate
            if r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Store read coordinates
            iv_seq = (co.ref_iv for co in r.cigar
                      if co.type in match_types and co.size > 0)

        else:  #paired-end read mapping

            # Store pair coordinates
            try:
                first_r, second_r = r
            except ValueError:
                notaligned += 1
                continue

            if first_r is None or second_r is None:
                notaligned += 1
                continue

            # Both mates exist from here on; the pair is only usable when
            # at least one of them is aligned
            if not (first_r.aligned or second_r.aligned):
                notaligned += 1
                continue

            iv_seq = tuple()
            if first_r.aligned:
                iv_seq = (co.ref_iv for co in first_r.cigar
                          if co.type in match_types and co.size > 0)

            if second_r.aligned:
                iv_seq = itertools.chain(
                    iv_seq,
                    (co.ref_iv for co in second_r.cigar
                     if co.type in match_types and co.size > 0))

            # Check whether either read aligned more than once
            try:
                if (first_r.optional_field("NH") > 1) or \
                   (second_r.optional_field("NH") > 1):
                    nonunique += 1
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(first_r.read.name),
                          file=sys.stderr)
                    continue
            except KeyError:
                # No NH tag present: treat the pair as uniquely aligned
                pass

            # Check if both reads passed the quality requirement
            if first_r.aQual < minaqual or second_r.aQual < minaqual:
                lowqual += 1
                continue

            # Check if the read pair was marked as a duplicate
            if first_r.pcr_or_optical_duplicate or \
                second_r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Append fragment length/insert-size to distribution
            try:
                fld.append(first_r.inferred_insert_size)
            except AttributeError:
                pass

        # Handle case where reads might overlap more than one feature
        try:
            if overlap_mode == "union":
                fs = set()  #store feature names when reads align
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom

                    for iv2, fs2 in features[iv].steps():
                        fs |= fs2

            else:  #intersection
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom

                    for iv2, fs2 in features[iv].steps():
                        # In strict mode empty steps also take part in the
                        # intersection and can therefore clear the set
                        if len(fs2) > 0 or \
                                overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)

            # If a read correctly mapped to a feature, increment its abundance
            if not fs:
                empty += 1
                continue
            elif len(fs) > 1:
                ambiguous += 1
                if not multi_aln:
                    continue
            else:
                aln_totals += 1

            for fsi in fs:
                counts[fsi]['count'] += 1

        except UnknownChrom:
            empty += 1

    unaln_totals = empty + ambiguous + lowqual + notaligned + nonunique + \
                      duplicate
    nmapped = aln_totals + empty + ambiguous + nonunique + lowqual

    # Choose how feature length is derived. This depends only on the run
    # configuration, so decide (and warn) once rather than once per unit.
    if are_transcripts and pe_mode:
        calc_length = compute_effective_length
    else:
        if are_transcripts:
            print(
                "warning: unable to calculate effective length from single-end "
                "reads. Will use sequence length instead.\n",
                file=sys.stderr)
        calc_length = return_first_arg

    for unit in out_handles:
        # Set scaling function
        if unit == 'fpk':
            norm_method = scale_abundance_fpk
            scaling_factor = None
        elif unit == 'fpkm':
            norm_method = scale_abundance_fpkm
            # Scaling factor is all mapped reads
            scaling_factor = nmapped
            print("info: the total number of mapped reads used in calculation "
                  "of FPKM is {!s}.\n".format(nmapped),
                  file=sys.stderr)
        elif unit in ('tpm', 'custom', 'prop'):
            # All three units are built on the sum of per-base count rates
            rate_sum = sum(counts[j]['count'] / counts[j]['length']
                           for j in counts)
            print("info: the sum of all counts per bp rates used in "
                  "estimating fragment proportions is {:.2f}.\n"
                  .format(rate_sum), file=sys.stderr)
            # NOTE(review): rate_sum is 0 when no read was assigned to any
            # feature, in which case the divisions below raise
            # ZeroDivisionError - confirm the desired handling.
            if unit == 'tpm':
                norm_method = scale_abundance_tpm
                # Scaling factor is sum of all reads per base rates
                scaling_factor = rate_sum
            elif unit == 'custom':
                norm_method = scale_abundance_prop
                scaling_factor = args.sfactor / rate_sum
            else:  #prop
                norm_method = scale_abundance_prop
                scaling_factor = 1 / rate_sum
        else:  #default is counts
            norm_method = scale_abundance_none
            scaling_factor = None

        out_h = out_handles[unit]

        # Abundance normalization
        abundances = {}
        unkwn_feat = 0
        no_map = 0
        for feature in counts:

            fcount = counts[feature]['count']
            flen = calc_length(counts[feature]['length'], fld)

            feature_abundance = norm_method(fcount, flen, scaling_factor)

            # Map to higher order features, if applicable
            if category_field:
                try:
                    # Ensure that feature has corresponding entry in database
                    feature_map = mapping[feature]
                except KeyError:
                    no_map += 1
                    if not category_only:
                        # Keep all features, even the uncategorized ones
                        abundances[feature] = abundances.get(feature, 0) + \
                                              feature_abundance
                    continue
                else:
                    try:
                        # Ensure that entry has relevant category field
                        category = feature_map[category_field]
                    except KeyError:
                        unkwn_feat += 1
                        if not category_only:
                            abundances[feature] = abundances.get(feature, 0) + \
                                                  feature_abundance
                        continue

                # Handle case where feature has more than one category, such
                # as if a protein sequence is assigned to more than one gene
                # family
                if isinstance(category, list):
                    categories = category
                else:
                    categories = [category]
                for category in categories:
                    # Read and write under the same normalized key, so that
                    # names with leading whitespace keep accumulating instead
                    # of being reset on every occurrence
                    cat_key = category.lstrip()
                    abundances[cat_key] = \
                        abundances.get(cat_key, 0) + feature_abundance

            else:
                abundances[feature] = abundances.get(feature, 0) + \
                                      feature_abundance

        # "UNMAPPED" can be interpreted as a single unknown gene of length one
        # kilobase recruiting all reads that failed to map to input features
        #abundances['UNMAPPED'] = unaln_totals

        # Output abundances sorted by key name
        for fn in sorted(abundances):
            if not fn.startswith("unkwn_"):
                write_io(out_h, "{}\t{!s}\n".format(fn, abundances[fn]))

        out_h.close()

    # The two counters below hold the values from the last unit processed;
    # out_handles is guaranteed non-empty, so they are always defined
    if unkwn_feat > 0:
        print("warning: found '{!s}' features without the '{}' field in the "
              "relational database.\n".format(unkwn_feat, category_field),
              file=sys.stderr)

    if no_map > 0:
        print("warning: found {!s} features without an entry in the "
              "relational database.\n".format(no_map),
              file=sys.stderr)

    # Output statistics
    print("Features processed:", file=sys.stderr)
    print("  - feature totals:\t{!s}".format(f_totals), file=sys.stderr)
    if feature_type:
        print("  - of relevant type:\t{!s}".format(ftype_totals),
              file=sys.stderr)
    print("  - unique features:\t{!s}".format(len(counts)), file=sys.stderr)
    print("Reads processed:", file=sys.stderr)
    print("  - read totals:\t{!s}".format(r_totals), file=sys.stderr)
    print("  - successfully mapped:\t{!s}".format(aln_totals),
          file=sys.stderr)
    if multi_aln:
        print("    - ambiguous alignment:\t{!s}".format(ambiguous),
              file=sys.stderr)
    print("  - unsuccessfully mapped:\t{!s}".format(unaln_totals),
          file=sys.stderr)
    print("    - no feature\t{!s}".format(empty), file=sys.stderr)
    if not multi_aln:
        print("    - ambiguous alignment\t{!s}".format(ambiguous),
              file=sys.stderr)
    print("    - too low alignment quality\t{!s}".format(lowqual),
          file=sys.stderr)
    print("    - not aligned\t{!s}".format(notaligned), file=sys.stderr)
    print("    - duplicate\t{!s}".format(duplicate), file=sys.stderr)
    print("    - alignment not unique\t{!s}".format(nonunique),
          file=sys.stderr)
    print("", file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to count {!s} fragments for {!s} features"
          .format(total_time, r_totals, f_totals), file=sys.stderr)
    print("", file=sys.stderr)
示例#18
0
def count_reads_in_features(sam_filenames, gff_filename,
                            samtype,
                            order, max_buffer_size,
                            stranded, overlap_mode,
                            multimapped_mode,
                            secondary_alignment_mode,
                            supplementary_alignment_mode,
                            feature_type, id_attribute,
                            additional_attributes,
                            quiet, minaqual, samouts):

    def write_to_samout(r, assignment, samoutfile):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                    'Select the same number of {:} input and output files'.format(samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'." %
                            (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
                attributes[f.attr[id_attribute]] = [
                        f.attr[attr] if attr in f.attr else ''
                        for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, (sam_filename) in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename == "-":
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
            else:
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
            read_seq_iter = iter(read_seq_file)
            # Catch empty BAM files
            try:
                first_read = next(read_seq_iter)
                pe_mode = first_read.paired_end
            except:
                first_read = None
                pe_mode = False
            if first_read is not None:
                read_seq = itertools.chain([first_read], read_seq_iter)
            else:
                read_seq = []
        except:
            sys.stderr.write(
                "Error occured when reading beginning of {:} file.\n".format(
                    samname))
            raise

        try:
            if pe_mode:
                if ((supplementary_alignment_mode == 'ignore') and
                   (secondary_alignment_mode == 'ignore')):
                    primary_only = True
                else:
                    primary_only = False
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(
                            read_seq,
                            primary_only=primary_only)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq,
                            max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d %s alignment record%s processed.\n" %
                        (i, samname, "s" if not pe_mode else " pairs"))
                    sys.stderr.flush()

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore') and
                       r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore') and
                       r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(
                                    r,
                                    "__alignment_not_unique",
                                    samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar if co.type in com
                                  and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r.cigar if (co.type in com and
                                                        co.size > 0))
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv) for co in r[1].cigar
                                    if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned", samoutfile)
                            notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                           (r[1] is not None and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                       (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if ((len(fs2) > 0) or
                                   (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                        samoutfile)
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0], samoutfile)

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                counts[list(fs)[0]] += 1
                        elif multimapped_mode == 'all':
                            for fsi in list(fs):
                                counts[fsi] += 1
                        else:
                            sys.exit("Illegal multimap mode.")


                except UnknownChrom:
                    write_to_samout(r, "__no_feature", samoutfile)
                    empty += 1

        except:
            sys.stderr.write(
                "Error occured when processing %s input (%s):\n" %
                (samname, read_seq_file.get_line_number_string()))
            raise

        if not quiet:
            sys.stderr.write(
                "%d %s %s processed.\n" %
                (i, samname, "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        if samoutfile is not None:
            samoutfile.close()

        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] + [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad + [str(c) for c in nonunique_all]))
def main():
    """Assign every read pair of a BAM file to one prioritised region type.

    Command-line entry point.  The steps are:

    1. Parse ``--input``, ``--strandness``, ``--output``, ``--beddir`` and
       ``--priority`` from the command line.
    2. For every region name in the comma-separated priority list, read
       ``<beddir>/<region>.bed`` and add the region name to an
       ``HTSeq.GenomicArrayOfSets`` over each BED interval.  The array is
       stranded unless ``--strandness no`` was given.
    3. Iterate over read pairs.  Pairs with a missing mate, an unaligned
       mate, or mates on different chromosomes are tallied as
       ``singleton``, ``unmapped`` and ``diff_chrom`` respectively and
       skipped.  Every other pair is credited to the first region of the
       priority list that overlaps either mate.
    4. Write one ``name<TAB>count`` line per region, followed by the
       ``singleton``, ``unmapped``, ``diff_chrom``, ``unassigned`` and
       ``total`` lines, to the ``--output`` file.
    """
    # Function-scope import: only needed here, to build the BED file paths.
    import os

    parser = argparse.ArgumentParser(
        description='Assign reads to different genomic regions')
    parser.add_argument(
        '--input',
        '-i',
        type=str,
        required=True,
        help=
        "Input bam file, should in hg38 coordinate, can either sorted by query name or coordiante"
    )
    parser.add_argument('--strandness',
                        '-s',
                        type=str,
                        default="no",
                        choices=["forward", "reverse", "no"])
    parser.add_argument('--output',
                        '-o',
                        type=str,
                        required=True,
                        help="Output reads assignment")
    parser.add_argument('--beddir',
                        '-bd',
                        type=str,
                        default="genome/bed",
                        help="Dir that contains bed files")
    parser.add_argument(
        '--priority',
        '-p',
        type=str,
        default=
        "lncRNA,mRNA,snoRNA,snRNA,srpRNA,tRNA,tucpRNA,Y_RNA,pseudogene,exon,intron,antisense,promoter,enhancer,repeats"
    )
    args = parser.parse_args()

    bam = HTSeq.BAM_Reader(args.input)
    # Order matters: earlier names win when a pair overlaps several regions.
    regions = args.priority.strip().split(",")

    print("Load genomic regions ...")
    ga = HTSeq.GenomicArrayOfSets("auto", stranded=(args.strandness != "no"))
    for region in regions:
        bed = os.path.join(args.beddir, region + ".bed")
        with open(bed, "r") as f:
            for line in f:
                line = line.strip()
                if line.startswith("#") or len(line) == 0:
                    continue
                fields = line.split("\t")
                # Columns 1-3 are chrom/start/end and column 6 is the strand,
                # so every data line must carry at least six fields.
                chrom, start, end, strand = (fields[0], int(fields[1]),
                                             int(fields[2]), fields[5])
                iv = HTSeq.GenomicInterval(chrom, start, end, strand=strand)
                ga[iv] += region
        print("{} loaded".format(region))
    print("Done .")

    stats = defaultdict(int)
    n_total_fragments = 0
    for read1, read2 in tqdm(
            HTSeq.pair_SAM_alignments_with_buffer(bam,
                                                  max_buffer_size=5000000)):
        n_total_fragments += 1
        # ignore singletons
        if (read1 is None) or (read2 is None):
            stats['singleton'] += 1
            continue
        # ignore unmapped reads
        if not (read1.aligned and read2.aligned):
            stats['unmapped'] += 1
            continue
        if read1.iv.chrom != read2.iv.chrom:
            stats['diff_chrom'] += 1
            continue

        # Put both mates on one strand before querying the array: the strand
        # of read 1 ("forward"), of read 2 ("reverse"), or none at all.
        if args.strandness == 'forward':
            read2.iv.strand = read1.iv.strand
        elif args.strandness == 'reverse':
            read1.iv.strand = read2.iv.strand
        else:
            read1.iv.strand = "."
            read2.iv.strand = "."

        feature_set = set()
        for mate in (read1, read2):
            for _, step_set in ga[mate.iv].steps():
                feature_set = feature_set.union(step_set)

        # Credit the pair to the highest-priority overlapping region only.
        for region in regions:
            if region in feature_set:
                stats[region] += 1
                break

    # Everything tallied so far (regions plus the three skip categories);
    # whatever is left of the total overlapped no region at all.  The builtin
    # sum keeps this a plain int, including for an empty input.
    n_accounted = sum(stats.values())
    stats["unassigned"] = n_total_fragments - n_accounted
    stats["total"] = n_total_fragments
    with open(args.output, "w") as f:
        for region in regions:
            print(region, stats[region], sep="\t", file=f)
        for each in [
                'singleton', 'unmapped', 'diff_chrom', 'unassigned', 'total'
        ]:
            print(each, stats[each], sep="\t", file=f)
示例#20
0
def _count_reads_matched_ivs(alignment, invert):
    """Yield the reference interval of each non-empty 'M' CIGAR operation.

    When *invert* is true every interval is passed through
    ``invert_strand`` before it is yielded.
    """
    for co in alignment.cigar:
        if co.type == "M" and co.size > 0:
            yield invert_strand(co.ref_iv) if invert else co.ref_iv


def _count_reads_feature_set(iv_seq, features, overlap_mode):
    """Collect the feature names overlapped by the intervals in *iv_seq*.

    In "union" mode the result is the union of the sets of every step (an
    empty set when nothing overlaps).  In "intersection-strict" and
    "intersection-nonempty" mode it is the intersection of the step sets
    (empty steps only take part in strict mode), or None when no step
    contributed.

    Raises UnknownChrom when an interval lies on a chromosome that
    ``features`` has no vector for, and exits the program on any other
    *overlap_mode*.
    """
    if overlap_mode == "union":
        fs = set()
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for _, step_features in features[iv].steps():
                fs = fs.union(step_features)
        return fs

    if overlap_mode in ("intersection-strict", "intersection-nonempty"):
        strict = overlap_mode == "intersection-strict"
        fs = None
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for _, step_features in features[iv].steps():
                if strict or len(step_features) > 0:
                    if fs is None:
                        fs = step_features.copy()
                    else:
                        fs = fs.intersection(step_features)
        return fs

    sys.exit("Illegal overlap mode.")


def _count_reads_write_report(label, output, feature_counts, empty, ambiguous,
                              lowqual, notaligned, nonunique):
    """Write one strand's count table to *output* and print its summary.

    *label* is "Forward" or "Reverse"; it prefixes the lines printed to
    standard output.
    """
    used_features_count = 0
    used_features_sum = 0
    print("%s written to %s" % (label, output))
    with open(output, "w") as output_file:
        for fn in sorted(feature_counts.keys()):
            output_file.write("%s\t%d\n" % (fn, feature_counts[fn]))
            used_features_count += 1
            used_features_sum += feature_counts[fn]
        output_file.write("__no_feature\t%d\n" % empty)
        output_file.write("__ambiguous\t%d\n" % ambiguous)
        output_file.write("__too_low_aQual\t%d\n" % lowqual)
        output_file.write("__not_aligned\t%d\n" % notaligned)
        output_file.write("__alignment_not_unique\t%d\n" % nonunique)
    print("%s features with alignment\t%d" % (label, used_features_count))
    print("%s alignments asigned to feature\t%d" % (label, used_features_sum))
    print("__%s_no_feature\t%d" % (label.lower(), empty))
    print("__%s_ambiguous\t%d" % (label.lower(), ambiguous))


def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count alignments per feature for the forward and/or reverse strand.

    Every alignment (or alignment pair) is skipped and tallied when it is
    unaligned, carries an ``NH`` tag above 1, or has an alignment quality
    below *minaqual*.  The remaining ones are matched against *features*
    once per requested strand orientation and credited to a feature only
    when exactly one feature name results.

    Parameters
    ----------
    sam_filename : str
        Path of the alignment file, or "-" to read from standard input.
    features :
        Interval-indexed container of feature-name sets; must offer
        ``chrom_vectors`` and interval indexing with ``.steps()``
        (presumably an ``HTSeq.GenomicArrayOfSets`` - confirm with caller).
    counts : dict
        Feature name -> 0 template.  It is shallow-copied per strand and
        never modified itself.
    samtype : str or None
        "sam" or "bam"; None asks ``detect_sam_type`` to decide.
    order : str
        "name" or "pos"; sorting order of paired-end input.
    forward, reverse : bool
        Which strand orientations to count.  Forward uses the first mate as
        aligned and the second mate strand-inverted; reverse does the
        opposite.
    overlap_mode : str
        "union", "intersection-strict" or "intersection-nonempty".
    quiet : bool
        Suppress the progress messages on standard error.
    minaqual : int
        Minimum alignment quality.
    samout : str
        Path of an annotated SAM output file, or "" for none.
    directory :
        Passed as ``outputdir`` to ``brenninc_utils.create_new_file`` when
        the per-strand result files are created.

    Raises
    ------
    ValueError
        For an unknown *samtype*, or an unknown *order* on paired-end input.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to each original SAM line.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    def strand_features(mates, invert_first):
        # mates: (alignment, is_second_mate) pairs.  A mate is inverted when
        # exactly one of "is second mate" / "invert first mate" holds.
        iv_seq = itertools.chain.from_iterable(
            _count_reads_matched_ivs(aln, invert_first != is_second)
            for aln, is_second in mates)
        return _count_reads_feature_set(iv_seq, features, overlap_mode)

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Counters exist for both strands whether or not they were requested, so
    # no code path below can hit an undefined name.
    empty_forward = ambiguous_forward = 0
    empty_reverse = ambiguous_reverse = 0
    counts_forward = counts_reverse = None
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0

    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read: put the peeked record back in
                # front of the remaining ones.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:  # report, then re-raise unchanged
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            if forward:
                counts_forward = copy.copy(counts)
            if reverse:
                counts_reverse = copy.copy(counts)

            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    mates = [(r, False)]
                else:
                    mates = []
                    if r[0] is not None and r[0].aligned:
                        mates.append((r[0], False))
                    if r[1] is not None and r[1].aligned:
                        mates.append((r[1], True))
                    if not mates:
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    # Resolve both orientations before counting either, so
                    # an unknown chromosome leaves all counters untouched.
                    if forward:
                        fs_for = strand_features(mates, False)
                    if reverse:
                        fs_rev = strand_features(mates, True)

                    if forward:
                        if fs_for is None or len(fs_for) == 0:
                            write_to_samout(r, "__no_feature")
                            empty_forward += 1
                        elif len(fs_for) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs_for) + "]")
                            ambiguous_forward += 1
                        else:
                            write_to_samout(r, list(fs_for)[0])
                            counts_forward[list(fs_for)[0]] += 1
                    if reverse:
                        if fs_rev is None or len(fs_rev) == 0:
                            write_to_samout(r, "__no_feature")
                            empty_reverse += 1
                        elif len(fs_rev) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs_rev) + "]")
                            ambiguous_reverse += 1
                        else:
                            write_to_samout(r, list(fs_rev)[0])
                            counts_reverse[list(fs_rev)[0]] += 1
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    if forward:
                        empty_forward += 1
                    if reverse:
                        empty_reverse += 1

        except:  # report where the input failed, then re-raise unchanged
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        # Close the annotated SAM output on success and on failure alike.
        if samoutfile is not None:
            samoutfile.close()

    if forward:
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_forward_count",
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        _count_reads_write_report("Forward", output, counts_forward,
                                  empty_forward, ambiguous_forward, lowqual,
                                  notaligned, nonunique)
    if reverse:
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_reverse_count",
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        _count_reads_write_report("Reverse", output, counts_reverse,
                                  empty_reverse, ambiguous_reverse, lowqual,
                                  notaligned, nonunique)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
    def _set_read_seq(
        self,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):

        """
        Prepare the BAM/SAM file iterator.
        Note, only run this after _set_BAM_reader as you need self.read_seq_file to be set.
        The first record is read to find out whether the input is paired-end
        (stored in self.pe_mode) and is then put back in front of the
        remaining records (stored in self.read_seq).  For paired-end input
        self.read_seq is additionally wrapped in an HTSeq mate-pairing
        iterator chosen by <order>.

        Parameters
        ----------
        supplementary_alignment_mode : str
            Whether to score supplementary alignments (0x800 flag).
            Choices: score or ignore.
        secondary_alignment_mode : str
            Whether to score secondary alignments (0x100 flag).
            Choices: score or ignore.
        order : str
            Can only be either 'pos' or 'name'. Sorting order of <alignment_file>.
        max_buffer_size : int
            When <alignment_file> is paired end sorted by position, allow only so many reads to stay in memory
            until the mates are found (raising this number will use more memory).
            Has no effect for single end or paired end sorted by name.

        Raises
        ------
        ValueError
            If the input is paired-end and <order> is neither 'name' nor 'pos'.

        """

        read_seq_iter = iter(self.read_seq_file)
        # Catch empty BAM files.  Any ordinary error while fetching the first
        # record is treated as "no reads, single-end" on purpose; only
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        # FIXME: this is still broad and can hide parser bugs.
        try:
            first_read = next(read_seq_iter)
            self.pe_mode = first_read.paired_end
        except Exception:
            first_read = None
            self.pe_mode = False

        if first_read is None:
            self.read_seq = []
        else:
            # Put the peeked record back in front of the remaining ones.
            self.read_seq = itertools.chain([first_read], read_seq_iter)

        if not self.pe_mode:
            return

        # Pair primary alignments only when neither secondary nor
        # supplementary alignments are going to be scored.
        primary_only = (supplementary_alignment_mode == "ignore"
                        and secondary_alignment_mode == "ignore")
        if order == "name":
            self.read_seq = HTSeq.pair_SAM_alignments(
                self.read_seq, primary_only=primary_only
            )
        elif order == "pos":
            self.read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                self.read_seq,
                max_buffer_size=max_buffer_size,
                primary_only=primary_only,
            )
        else:
            raise ValueError("Illegal order specified.")
示例#22
0
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts, utr_tag):
    def write_to_samout(r, assignment, samoutfile):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(
                    samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    tags = utr_tag.split(",")

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type or f.type in tags:  # includes all entries with these tags
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))

                if f.type in tags:
                    features[f.iv] += f.attr[
                        id_attribute] + '^' + 'UTR'  # in features dictionary, which contains intervals for each region, each entry is tagged with their featuretype separated by a unique character
                else:
                    features[f.iv] += feature_id + '^' + f.type

                counts[f.attr[
                    id_attribute]] = 0  # the counts dictionary does not include this, meaning that the final counts will not include the added tag
                attributes[f.attr[id_attribute]] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes
                ]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of specified types found.\n")

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, (sam_filename) in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename == "-":
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
            else:
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
            read_seq_iter = iter(read_seq_file)
            # Catch empty BAM files
            try:
                first_read = next(read_seq_iter)
                pe_mode = first_read.paired_end
            except:
                first_read = None
                pe_mode = False
            if first_read is not None:
                read_seq = itertools.chain([first_read], read_seq_iter)
            else:
                read_seq = []
        except:
            sys.stderr.write(
                "Error occured when reading beginning of {:} file.\n".format(
                    samname))
            raise

        try:
            if pe_mode:
                if ((supplementary_alignment_mode == 'ignore')
                        and (secondary_alignment_mode == 'ignore')):
                    primary_only = True
                else:
                    primary_only = False
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(
                        read_seq, primary_only=primary_only)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq,
                        max_buffer_size=max_buffer_size,
                        primary_only=primary_only)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d %s alignment record%s processed.\n" %
                        (i, samname, "s" if not pe_mode else " pairs"))
                    sys.stderr.flush()

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore')
                            and r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore')
                            and r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique",
                                            samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if (co.type in com and co.size > 0))
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq, (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq, (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned", samoutfile)
                            notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique",
                                            samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

                    elif len(fs) >= 3:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                        samoutfile)
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0], samoutfile)

                    # this change works on the assumption that in TAGseq, no reads should map to non-UTRs. Because manually added
                    # 3' UTRs may overlap with downstream genes
                    if fs is not None and len(
                            fs
                    ) > 0:  # since fs contains the gene names from the features dictionary, we can  differentiate between UTRs and CDS and prioritize whichever we need

                        if multimapped_mode == 'none':

                            if len(fs) == 1:
                                counts[list(fs)[0].split(
                                    '^'
                                )[0]] += 1  # if multimapped counts go to same ID.

                            elif len(
                                    fs
                            ) == 2:  # if mapping overlaps with two neighboring sequences and one of them is a UTR, count goes to UTR.

                                if list(fs)[0].split(
                                        '^')[-1] == 'UTR' and list(
                                            fs)[1].split('^')[-1] != 'UTR':
                                    counts[list(fs)[0].split('^')[0]] += 1

                                elif list(fs)[1].split(
                                        '^')[-1] == 'UTR' and list(
                                            fs)[0].split('^')[-1] != 'UTR':
                                    counts[list(fs)[1].split('^')[0]] += 1

                                else:  #  read overlaps with different features
                                    write_to_samout(
                                        r, "__ambiguous[" + '+'.join(fs) + "]",
                                        samoutfile)
                                    ambiguous += 1

                            # else:  #  read overlaps with more than one count
                            #     write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                            #         samoutfile)
                            #     ambiguous += 1

                        elif multimapped_mode == 'all':
                            for fsi in list(fs):
                                counts[fsi] += 1
                        else:
                            sys.exit("Illegal multimap mode.")

                except UnknownChrom:
                    write_to_samout(r, "__no_feature", samoutfile)
                    empty += 1

        except:
            sys.stderr.write("Error occured when processing %s input (%s):\n" %
                             (samname, read_seq_file.get_line_number_string()))
            raise

        if not quiet:
            sys.stderr.write(
                "%d %s %s processed.\n" %
                (i, samname,
                 "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        if samoutfile is not None:
            samoutfile.close()

        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    pad = ['' for attr in additional_attributes]

    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))

    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c)
                                               for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
示例#23
0
def count_reads_in_features(sam_filenames, gff_filename,
                            samtype,
                            order, max_buffer_size,
                            stranded, overlap_mode,
                            multimapped_mode,
                            secondary_alignment_mode,
                            supplementary_alignment_mode,
                            feature_type, id_attribute,
                            additional_attributes,
                            quiet, minaqual, samouts):
    """Count, for each alignment file, the reads overlapping each GFF feature.

    The GFF file is loaded into a genomic array of feature-ID sets, every
    input is scanned read by read (or pair by pair when the first record is
    flagged as paired-end), and a tab-separated table is printed to stdout:
    one row per feature ID (sorted), followed by the special rows
    ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``, ``__not_aligned``
    and ``__alignment_not_unique``; one count column per input file.

    Args:
        sam_filenames: List of input paths. The single-element list ``['-']``
            reads from standard input.
        gff_filename: Path handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; selects how paired-end records are
            paired up. Only consulted for paired-end input.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"no"`` builds an unstranded feature array; ``"reverse"``
            inverts the strand of the read intervals; any other value uses
            the strand as reported.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'`` (count only reads assigned to exactly
            one feature) or ``'all'`` (count every overlapped feature).
        secondary_alignment_mode: ``'ignore'`` skips secondary alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: GFF feature type (3rd column) to keep.
        id_attribute: GFF attribute used as the feature ID.
        additional_attributes: List of extra GFF attribute names printed
            between the feature ID and the counts.
        quiet: If true, progress messages on stderr are suppressed.
        minaqual: Reads with an alignment quality below this are skipped
            and counted as ``__too_low_aQual``.
        samouts: ``""`` for no annotated output, otherwise a list of output
            paths, one per input file.

    Raises:
        ValueError: If the number of ``samouts`` differs from the number of
            inputs, a feature lacks ``id_attribute``, a feature has no strand
            in stranded mode, ``samtype`` is unknown, or ``order`` is illegal.
        SystemExit: Via ``sys.exit`` on an illegal overlap or multimap mode
            (checked while processing reads).
    """

    def write_to_samout(r, assignment, samoutfile):
        # Tag the read (or both mates of a pair) with an XF field holding
        # the assignment and append it to the annotated SAM output, if any.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError('Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError("Feature %s does not contain a '%s' attribute" %
                                     (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError("Feature %s at %s does not have strand information but you are "
                                     "running htseq-count in stranded mode. Use '--stranded=no'." %
                                     (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                        f.attr[attr] if attr in f.attr else ''
                        for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Bare except on purpose: report the GFF position, then re-raise.
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    # Per-input results, one entry appended per alignment file.
    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, (sam_filename) in enumerate(sam_filenames):
        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                # Peek at the first record only; the reader itself is
                # iterated again in the counting loop below.
                first_read = next(iter(read_seq), None)
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter, None)
                if first_read is not None:
                    # stdin cannot be rewound: put the peeked record back.
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            # An input without any record is processed as single-end and
            # simply yields all-zero counts instead of raising StopIteration.
            pe_mode = first_read is not None and first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            # Do not leave the annotated output handle open on failure.
            if samoutfile is not None:
                samoutfile.close()
            raise

        try:
            if pe_mode:
                if ((supplementary_alignment_mode == 'ignore') and
                   (secondary_alignment_mode == 'ignore')):
                    primary_only = True
                else:
                    primary_only = False
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(
                            read_seq,
                            primary_only=primary_only)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq,
                            max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))
                    sys.stderr.flush()

                i += 1
                if not pe_mode:
                    # Single-end: r is one alignment record.
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore') and
                       r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore') and
                       r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        # No NH field: the read is not treated as multimapped.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar if co.type in com
                                  and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r.cigar if (co.type in com and
                                                        co.size > 0))
                else:
                    # Paired-end: r is a (mate1, mate2) pair, either of which
                    # may be None. Mate 2 gets the opposite strand treatment
                    # to mate 1.
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv) for co in r[1].cigar
                                    if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned", samoutfile)
                            notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                           (r[1] is not None and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        # No NH field: the pair is not treated as multimapped.
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                       (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    # Collect the set of feature IDs (fs) the read overlaps,
                    # according to the selected overlap mode.
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                # "nonempty" ignores stretches without any
                                # feature; "strict" intersects them too.
                                if ((len(fs2) > 0) or
                                   (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                        samoutfile)
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0], samoutfile)

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                counts[list(fs)[0]] += 1
                        elif multimapped_mode == 'all':
                            for fsi in list(fs):
                                counts[fsi] += 1
                        else:
                            sys.exit("Illegal multimap mode.")


                except UnknownChrom:
                    # Read lies on a chromosome absent from the annotation.
                    write_to_samout(r, "__no_feature", samoutfile)
                    empty += 1

        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            # Do not leave the annotated output handle open on failure.
            if samoutfile is not None:
                samoutfile.close()
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        if samoutfile is not None:
            samoutfile.close()

        # Snapshot this file's counts, then reset the shared dict for the
        # next input.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells keep the special rows aligned with the attribute columns.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] + [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad + [str(c) for c in nonunique_all]))
def main():
    """Compute mean read coverage around exon boundaries for both mates.

    Command-line entry point. Pairs the alignments of the input BAM file,
    adds per-base coverage of the CIGAR ``M`` segments of mate 1 and mate 2
    to two separate genomic arrays, then, for every ``exon`` line of the GTF
    file, extracts a 100-base window around the exon's 5' and 3' boundary
    (reversed for exons not on the ``+`` strand). Windows of exons where at
    least one of the four profiles has a mean above ``--filter`` are summed,
    divided by the number of such exons and written as a tab-separated
    table; optionally a plot is produced via ``plotCoverage``.
    """
    cli = argparse.ArgumentParser(
        description='Assign reads to different genomic regions')
    cli.add_argument('--input', '-i', type=str, required=True,
                     help="Input bam file, should in hg38 coordinate")
    cli.add_argument('--strandness', '-s', type=str, default="no",
                     choices=["forward", "reverse", "no"])
    cli.add_argument(
        '--filter', '-f', type=int, default=3,
        help="Only consider exon with mean coverage higher than this value")
    cli.add_argument('--gtf', '-a', type=str,
                     default="genome/gtf/gencode.v27.annotation.gtf",
                     help="gtf annotation")
    cli.add_argument('--coverage', '-c', type=str, required=True,
                     help="Output coverage")
    cli.add_argument('--pdf', '-p', type=str, default=None,
                     help="Output coverage plot")
    args = cli.parse_args()

    # One coverage array per mate; stranded unless --strandness is "no".
    use_strand = args.strandness != "no"
    ga1 = HTSeq.GenomicArray("auto", stranded=use_strand)
    ga2 = HTSeq.GenomicArray("auto", stranded=use_strand)

    print("Load bam file ...")
    bam = HTSeq.BAM_Reader(args.input)
    pairs = HTSeq.pair_SAM_alignments_with_buffer(bam,
                                                  max_buffer_size=5000000)
    for read1, read2 in tqdm(pairs):
        # Keep only pairs where both mates exist, are aligned and share a
        # chromosome.
        if (read1 is None or read2 is None
                or not read1.aligned or not read2.aligned
                or read1.iv.chrom != read2.iv.chrom):
            continue
        read1.iv.strand = "."
        read2.iv.strand = "."
        for mate, mate_tag, cov in ((read1, "1", ga1), (read2, "2", ga2)):
            for cigop in mate.cigar:
                if cigop.type == "M":
                    cov[checkStrandness(cigop.ref_iv, mate_tag,
                                        args.strandness)] += 1
    print("Done.")

    exonNumber = 0
    print("Get coverage of exons in gtf annotation ...")
    # Running sums, in the order: read1-5p, read2-5p, read1-3p, read2-3p.
    totals = [np.zeros(100) for _ in range(4)]
    with open(args.gtf) as gtf:
        for record in tqdm(gtf):
            record = record.strip()
            if not record or record.startswith("#"):
                continue
            # GTF columns, e.g.: chr1  HAVANA  exon  12613  12721 ...
            fields = record.split("\t")
            if fields[2] != "exon":
                continue
            strand = fields[6]
            on_plus = strand == "+"
            # On "+" the 5' boundary is the start column (3) and the 3'
            # boundary the end column (4); otherwise the roles are swapped.
            five_col, three_col = (3, 4) if on_plus else (4, 3)
            five_iv = HTSeq.GenomicInterval(
                fields[0],
                int(fields[five_col]) - 1 - 50,
                int(fields[five_col]) + 49, strand)
            three_iv = HTSeq.GenomicInterval(
                fields[0],
                int(fields[three_col]) - 1 - 50,
                int(fields[three_col]) + 49, strand)
            if five_iv.start < 0 or three_iv.start < 0:
                continue
            profiles = [np.fromiter(cov[window], dtype="i")
                        for window in (five_iv, three_iv)
                        for cov in (ga1, ga2)]
            if not on_plus:
                # Flip so that position 0 is always the upstream end.
                profiles = [profile[::-1] for profile in profiles]
            if any(profile.mean() > args.filter for profile in profiles):
                exonNumber += 1
                for total, profile in zip(totals, profiles):
                    total += profile

    print("Done .")
    five1, five2, three1, three2 = totals
    df = pd.DataFrame({
        "read1-5p": five1,
        "read1-3p": three1,
        "read2-5p": five2,
        "read2-3p": three2
    })
    df = df / exonNumber
    df.to_csv(args.coverage, sep="\t")
    if args.pdf is not None:
        plotCoverage(df, args.pdf)
示例#25
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order, overlap_mode,
    feature_type, id_attribute, quiet, minaqual, mapping_file, scale_method):
    """Count aligned reads per GFF feature and print the resulting abundances.

    Features of type ``feature_type`` are loaded from the GFF file into an
    unstranded ``HTSeq.GenomicArrayOfSets``.  Every alignment (or alignment
    pair, when the first record reports ``paired_end``) is then assigned to
    the feature(s) it overlaps according to ``overlap_mode``.  If a mapping
    file is given, per-feature counts are aggregated into the categories it
    lists.  Results go to stdout as ``name<TAB>value`` lines; the per-reason
    rejection statistics go to stderr.

    Args:
        sam_filename: Path of the alignment file, or ``"-"`` to read stdin.
        gff_filename: Path of the GFF annotation file.
        samtype: ``"sam"`` or ``"bam"``.
        order: ``"name"`` or ``"position"``; only consulted for paired-end
            input, where it selects the mate-pairing strategy.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        feature_type: GFF feature type (third column) to count.
        id_attribute: GFF attribute whose value is used as the feature id;
            features lacking it are skipped.
        quiet: If true, suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are rejected.
        mapping_file: Optional tab-separated file with four columns
            (feature, category, length, organism).  The category column may
            hold several comma-separated categories; an empty category falls
            back to the feature id itself.
        scale_method: ``'none'`` to use raw counts; any other value passes
            the count and feature length to ``scale_abundance``.

    Raises:
        ValueError: For an unknown ``samtype`` or ``order``, or when a
            mapping-file row does not have exactly four columns.
    """

    features = HTSeq.GenomicArrayOfSets("auto", False)
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    # Try to open mapping file to fail early in case it is not there
    if mapping_file:
        open(mapping_file).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    continue
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} GFF lines processed.\n".format(i))
    except:
        # Report where parsing stopped, then let the original error propagate.
        sys.stderr.write("Error occured when processing GFF file ({}):\n"
            .format(gff.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} GFF lines processed.\n".format(i))

    num_features = len(counts)
    if num_features == 0:
        sys.stderr.write("Warning: No features of type '{}' found.\n"
            .format(feature_type))

    if samtype == "sam":
        align_reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        align_reader = HTSeq.BAM_Reader
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("Unknown input format {} specified.".format(samtype))

    try:
        if sam_filename != "-":
            read_seq_file = align_reader(sam_filename)
            read_seq = read_seq_file
            # Peek at the first record using a throw-away iterator; the main
            # loop below iterates the reader again from scratch.
            first_read = next(iter(read_seq))
        else:
            # stdin cannot be re-read, so push the peeked record back in
            # front of the remaining stream.
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading SAM/BAM file.\n" )
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "position":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} SAM alignment record{} processed.\n"
                    .format(i, "s" if not pe_mode else " pairs"))

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: the aligner did not report a hit count.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
            else:
                # r is a (first mate, second mate) pair; either may be None.
                if r[0] is not None and r[0].aligned:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain( iv_seq,
                        (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1 ) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            # "nonempty" ignores feature-free stretches;
                            # "strict" lets them empty the intersection.
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Reads on chromosomes absent from the annotation have no feature.
                empty += 1

    except:
        sys.stderr.write("Error occured when processing SAM input ({}):\n"
            .format(read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} SAM {} processed.\n"
            .format(i, "alignments " if not pe_mode else "alignment pairs"))

    # map to higher order features if applicable
    if mapping_file:
        abundances = {}
        # Features that the mapping file assigned to at least one category.
        # Tracked separately because `abundances` is keyed by *category*, so
        # testing feature ids against it would wrongly treat every feature
        # whose category has a different name as unmapped.
        mapped_features = set()
        with open(mapping_file) as mapping_h:
            for row in csv.reader(mapping_h, delimiter='\t'):
                try:
                    feature, feature_category, feature_length, organism = row
                except ValueError:
                    sys.stderr.write("Can't determine the format of '{}'".format(mapping_file))
                    raise
                if feature not in counts:
                    continue
                mapped_features.add(feature)
                if not feature_category:
                    feature_category = feature
                abund = counts[feature] if scale_method == 'none' else scale_abundance(counts[feature], int(feature_length))
                if ',' in feature_category:
                    cats = feature_category.split(',')
                    for category in cats:
                        abundances[category] = abundances.get(category, 0) + abund
                else:
                    abundances[feature_category] = abundances.get(feature_category, 0) + abund

        if num_features > 0 and len(abundances) == 0:
            sys.stderr.write("Warning: No higher order features found. Please "
                "make sure the mapping file is formatted correctly.\n")

        # Reads on features that the mapping file does not mention are
        # pooled under UNMAPPED (exactly once).
        for feature in counts:
            if feature not in mapped_features:
                abundances['UNMAPPED'] = abundances.get('UNMAPPED', 0) + counts[feature]

    else:
        abundances = counts

    # "UNMAPPED" can be interpreted as a single unknown gene of length 1
    # kilobase recruiting all reads that failed to map to known sequences
    abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0) + empty + ambiguous + lowqual + notaligned + nonunique)

    for fn in sorted(abundances.keys()):
        print("{}\t{!s}".format(fn, abundances[fn]))
    sys.stderr.write("__no_feature\t{!s}\n".format(empty))
    sys.stderr.write("__ambiguous\t{!s}\n".format(ambiguous))
    sys.stderr.write("__too_low_aQual\t{!s}\n".format(lowqual))
    sys.stderr.write("__not_aligned\t{!s}\n".format(notaligned))
    sys.stderr.write("__alignment_not_unique\t{!s}\n".format(nonunique))
def count_PE_reads(sam_files,
                   labels,
                   regions,
                   file_type="sam",
                   use_chrom_name=False,
                   order="name"):
    """Count fragments (PE read pairs) for each region from all SAM/BAM files.

    Args:
        sam_files: Paths of the alignment files to process.
        labels: One label per file; only its length is checked here.
        regions: Object indexed by a genomic interval whose ``.steps()``
            yields ``(interval, set_of_region_names)`` pairs.  Not consulted
            when ``use_chrom_name`` is true.
        file_type: ``"sam"`` selects ``HTSeq.SAM_Reader``; any other value
            selects ``HTSeq.BAM_Reader``.
        use_chrom_name: If true, count a pair under the reference name both
            mates align to instead of looking up overlapping regions.
        order: ``"name"`` pairs mates assuming name-sorted input; any other
            value uses the buffered pairing for position-sorted input.

    Returns:
        A list with one ``collections.Counter`` per input file, in the same
        order as ``sam_files``.  Besides region (or chromosome) names the
        counters may hold the special keys ``"_unmapped"`` and
        ``"_no_feature"``.
    """

    assert len(sam_files) == len(labels)
    if use_chrom_name:
        print("INFO: Running in mode for counting per chromosome name.")

    # one counter (implicit zero default) per input file
    all_counts = [collections.Counter() for _ in sam_files]

    # iterate over all sam/bam files
    for file_counts, sam_file in zip(all_counts, sam_files):

        # Single pre-formatted string so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3).
        print("INFO: Start to count reads in %s ..." % (sam_file,))

        if file_type == "sam":
            almnt_file = HTSeq.SAM_Reader(sam_file)
        else:
            almnt_file = HTSeq.BAM_Reader(sam_file)

        # pair alignment records according to PE pairs and iterate over pairs
        if order == "name":
            print("INFO: Assuming SAM/BAM file ordered by read name.")
            alignmentIterator = HTSeq.pair_SAM_alignments(almnt_file)
        else:
            print("INFO: Assuming SAM/BAM file ordered by position")
            alignmentIterator = HTSeq.pair_SAM_alignments_with_buffer(
                almnt_file, max_buffer_size=100 * 3000000)

        for first_almnt, second_almnt in alignmentIterator:

            # both mates must be present and aligned
            if first_almnt is None or second_almnt is None or not (
                    first_almnt.aligned and second_almnt.aligned):
                file_counts["_unmapped"] += 1
                continue

            # potential speed up for transcript fragments as reference
            if use_chrom_name:
                if first_almnt.iv.chrom == second_almnt.iv.chrom:
                    file_counts[first_almnt.iv.chrom] += 1
                else:
                    file_counts["_no_feature"] += 1
                continue

            # collect the names of all regions overlapping each mate
            gene_ids_first = set()
            gene_ids_second = set()
            for _, val in regions[first_almnt.iv].steps():
                gene_ids_first |= val
            for _, val in regions[second_almnt.iv].steps():
                gene_ids_second |= val

            # keep only regions hit by both the first and the second read
            gene_ids = gene_ids_first & gene_ids_second

            if not gene_ids:
                # read pair does not share any region
                file_counts["_no_feature"] += 1
            else:
                # the pair is counted once for EVERY shared region, so
                # overlapping regions each receive the fragment
                for gene_id in gene_ids:
                    file_counts[gene_id] += 1

    # return counts
    return all_counts
示例#27
0
def pool(infile, targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss):
    """Tally spliced / unspliced reads and junction types for one sample.

    Reads ``<infile>/<infile>.bam`` as mate pairs and, for every first mate
    that is aligned with ``aQual > 5`` and whose reference position (see
    ``_pool_read_position``) hits nothing in ``targets``, collects:

    * the ``targets`` ids overlapped by its matched (``M``) segments
      ("premature" evidence),
    * the ``targets`` ids overlapped by its skipped (``N``) segments that are
      flanked by matches longer than 3 bases ("mature" evidence), and
    * a classification of each such junction against ``fiveSS`` / ``threeSS``.

    Five tab-separated result files are written into the ``infile``
    directory.

    Args:
        infile: Sample name; used both as directory and as file prefix.
        targets: Genomic array of sets of ``"gene;number"`` id strings.
        intron_set: Iterable of ids to report in the per-intron files.
        fiveSS: Dict keyed by ``(chrom, position, strand)``; values are
            indexable with ``[0]`` compared as the gene and ``[1]``
            convertible to ``int``.
        threeSS: Same layout as ``fiveSS`` for the other splice site.
        Branches: Genomic array of sets queried with the second mate's
            position for the ``lariat_int`` counter.
        Branchto3ss: Same, for the ``branch_to3ss`` counter.
    """

    SI_counts = defaultdict(int)
    junction_counts = defaultdict(int)

    for f, s in HTSeq.pair_SAM_alignments_with_buffer(
            HTSeq.BAM_Reader('%s/%s.bam' % (infile, infile))):

        if f is None or not f.aligned or not f.aQual > 5:
            continue
        # Only reads whose own reference position lies outside every target
        # are analysed further.
        if len(targets[_pool_read_position(f)]) != 0:
            continue

        introns = set()
        junctions = set()

        for idx, cigop in enumerate(f.cigar):
            if cigop.type == 'M':
                for _, val in targets[cigop.ref_iv].steps():
                    introns |= val
                continue
            if cigop.type != 'N':
                continue
            # Require matches longer than 3 bases on both sides of the gap
            # (short-circuit order kept so idx + 1 is only touched when the
            # left flank already qualifies).
            flanked = (f.cigar[idx - 1].type == 'M'
                       and f.cigar[idx - 1].size > 3
                       and f.cigar[idx + 1].type == 'M'
                       and f.cigar[idx + 1].size > 3)
            if not flanked:
                continue

            for _, val in targets[cigop.ref_iv].steps():
                junctions |= val

            chrom = cigop.ref_iv.chrom
            if cigop.ref_iv.strand == '+':
                first = cigop.ref_iv.end
                second = cigop.ref_iv.start + 1
                junc_strand = "+"
            else:
                first = cigop.ref_iv.start + 1
                second = cigop.ref_iv.end
                junc_strand = '-'

            five_key = (chrom, first, junc_strand)
            three_key = (chrom, second, junc_strand)
            if five_key in fiveSS and three_key in threeSS:
                up = fiveSS[five_key]
                down = threeSS[three_key]
                if up[0] == down[0]:
                    if up[1] == down[1]:
                        kind = "Constituitive"
                    else:
                        kind = "Exon Skipping"
                    junction_counts[(infile, up[0], int(up[1]),
                                     int(down[1]) + 1, kind)] += 1
            elif five_key in fiveSS:
                # Only the fiveSS end is annotated.  The previous code used
                # `up` / `down` left over from an earlier junction here (or
                # crashed with NameError); attribute the junction to the
                # annotated site's own gene and intron instead.
                # NOTE(review): confirm this is the intended bookkeeping.
                up = fiveSS[five_key]
                junction_counts[(infile, up[0], int(up[1]),
                                 int(up[1]) + 1, "Alternative 3'")] += 1
            elif three_key in threeSS:
                # Mirror case: only the threeSS end is annotated.
                down = threeSS[three_key]
                junction_counts[(infile, down[0], int(down[1]),
                                 int(down[1]) + 1, "Alternative 5'")] += 1

        intron = _pool_best_id(introns)
        junction = _pool_best_id(junctions)

        # The same id supported as both retained and spliced is uninformative.
        if junction == intron:
            intron = ''
            junction = ''

        # Otherwise keep only one of the two, by the ";number" field.
        # NOTE(review): this is a string comparison, as in the original.
        if junction and intron:
            if junction.split(';')[1] > intron.split(';')[1]:
                intron = ''
            else:
                junction = ''

        candidate_genes = set()
        for ident in introns:
            candidate_genes.add(ident.split(';')[0])
        for ident in junctions:
            candidate_genes.add(ident.split(';')[0])

        # The mate may be missing (None) when pairing fails, so every use of
        # `s` is guarded.
        mate_ok = (s is not None and s.aligned and s.proper_pair
                   and s.aQual > 5)

        if len(candidate_genes) == 1:
            if junction:
                SI_counts[('mature', junction)] += 1
            if intron:
                SI_counts[('premature', intron)] += 1
            if f.proper_pair and mate_ok:
                if junction:
                    SI_counts[('concordant_mature', junction)] += 1
                if intron:
                    SI_counts[('concordant_premature', intron)] += 1

        # Counts starting position of read 2's that fall within specified
        # lariat intermediate and branch to 3'SS windows.  `intron` is a
        # string, so test it for non-emptiness (it used to be compared with
        # the integer 0, which raises TypeError on Python 3).
        if intron and mate_ok:
            mate_pos = _pool_read_position(s)
            if intron in Branches[mate_pos] and len(
                    Branches[mate_pos]) == 1:
                SI_counts[('lariat_int', intron)] += 1
            if intron in Branchto3ss[mate_pos] and len(
                    Branchto3ss[mate_pos]) == 1:
                SI_counts[('branch_to3ss', intron)] += 1

    with open('%s/%s_splicing_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (intron, SI_counts[('mature', intron)],
                                        SI_counts[('premature', intron)]))

    with open('%s/%s_concordant_splicing_counts.txt' % (infile, infile),
              'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' %
                      (intron, SI_counts[('concordant_mature', intron)],
                       SI_counts[('concordant_premature', intron)]))

    with open('%s/%s_lariat_int_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron, SI_counts[('lariat_int', intron)]))

    with open('%s/%s_branch_to3ss_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' %
                      (intron, SI_counts[('branch_to3ss', intron)]))

    with open('%s/%s_junction_counts.txt' % (infile, infile), 'w') as out:
        out.write('Gene\tUpstream\tDownstream\tType\tCount\n')
        for junc in sorted(junction_counts):
            out.write(
                '%s\t%d\t%d\t%s\t%d\n' %
                (junc[1], junc[2], junc[3], junc[4], junction_counts[junc]))


def _pool_read_position(aln):
    """Return the position used to look an alignment up: iv.start on '+', iv.end otherwise."""
    iv = aln.iv
    if iv.strand == '+':
        return HTSeq.GenomicPosition(iv.chrom, iv.start, iv.strand)
    return HTSeq.GenomicPosition(iv.chrom, iv.end, iv.strand)


def _pool_best_id(ids):
    """Return the id whose ';'-separated second field is largest, or '' if ``ids`` is empty."""
    if len(ids) == 0:
        return ''
    # String comparison of the second field, first maximum wins on ties.
    return max(ids, key=lambda ident: ident.split(';')[1])
示例#28
0
def count_reads_single_file(
    isam,
    sam_filename,
    features,
    feature_attr,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout_format,
    samout_filename,
):
    """Count the reads of one alignment file against pre-loaded features.

    Args:
        isam: Identifier of this input; passed through unchanged in the
            result.
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.  It
            is always opened with ``HTSeq.BAM_Reader``.
        features: Genomic array of sets of feature ids (must provide
            ``chrom_vectors`` and interval indexing with ``.steps()``).
        feature_attr: Iterable of feature ids; each becomes a key of the
            returned ``counts`` dict, initialised to 0.
        order: ``"name"`` or ``"pos"``; mate-pairing strategy for paired-end
            input.
        max_buffer_size: Buffer limit for position-sorted mate pairing.
        stranded: ``"reverse"`` flips which mate's strand is inverted; any
            other value uses the forward convention.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'``, ``'all'``, ``'fraction'`` or
            ``'random'``; how a read overlapping several features (or
            reported as multiply aligned) contributes to the counts.
        secondary_alignment_mode: ``'ignore'`` skips secondary alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: Not used by this function.
        id_attribute: Not used by this function.
        additional_attributes: Not used by this function.
        quiet: If true, suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are rejected.
        samout_format: ``'sam'``/``'SAM'`` or ``'bam'``/``'BAM'``.
        samout_filename: Optional path for an annotated copy of the reads
            (each tagged with an ``XF`` field); ``None`` disables it.

    Returns:
        A dict with the keys ``'isam'``, ``'counts'``, ``'empty'``,
        ``'ambiguous'``, ``'lowqual'``, ``'notaligned'`` and ``'nonunique'``.

    Raises:
        ValueError: If ``order`` is not recognised for paired-end input.
    """
    def write_to_samout(r, assignment, samoutfile, template=None):
        """Append an XF tag with ``assignment`` to the read(s) and write them out."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                samout_filename,
                'wb',
                template=template,
            )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs
        except:
            first_read = None
            pe_mode = False
        if first_read is not None:
            # Put the peeked record back in front of the remaining stream.
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')
    counts = {key: 0 for key in feature_attr}

    try:
        if pe_mode:
            if ((supplementary_alignment_mode == 'ignore')
                    and (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq,
                                                     primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("%d alignment record%s processed.\n" %
                                 (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "__not_aligned", samoutfile, template)
                    continue
                if ((secondary_alignment_mode == 'ignore')
                        and r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore')
                        and r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique",
                                        samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: the aligner did not report a hit count.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual", samoutfile, template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type in com and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if (co.type in com and co.size > 0))
            else:
                # r is a (first mate, second mate) pair; either may be None.
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate's strand is inverted in the forward
                    # convention and kept as-is in the reverse one.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "__not_aligned", samoutfile,
                                        template)
                        notaligned += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                        (r[1] is not None and r[1].optional_field("NH") > 1)):
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique",
                                        samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual)
                        or (r[1] and r[1].aQual < minaqual)):
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual", samoutfile, template)
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # "nonempty" ignores feature-free stretches;
                            # "strict" lets them empty the intersection.
                            if ((len(fs2) > 0) or
                                (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature", samoutfile, template)
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                    samoutfile, template)
                    ambiguous += 1
                else:
                    write_to_samout(r, list(fs)[0], samoutfile, template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[fsi] += 1
                    elif multimapped_mode == 'fraction':
                        for fsi in list(fs):
                            counts[fsi] += 1.0 / len(fs)
                    elif multimapped_mode == 'random':
                        # random.choice needs an indexable sequence; fs is a
                        # set, so passing it directly raises TypeError.
                        fsi = random.choice(list(fs))
                        counts[fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                # Reads on chromosomes absent from the annotation have no feature.
                write_to_samout(r, "__no_feature", samoutfile, template)
                empty += 1

    except:
        sys.stderr.write("Error occured when processing input (%s):\n" %
                         (read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    if samoutfile is not None:
        samoutfile.close()

    return {
        'isam': isam,
        'counts': counts,
        'empty': empty,
        'ambiguous': ambiguous,
        'lowqual': lowqual,
        'notaligned': notaligned,
        'nonunique': nonunique,
    }
def _build_count_parser():
    """Return the argparse parser that defines the command line of main().

    Numeric options carry their defaults here so that an explicit ``0`` on
    the command line is honoured instead of being mistaken for "not given".
    ``--minaqual`` is parsed as an integer because it is compared against
    the alignment's ``aQual`` value.
    """
    exe_parser = argparse.ArgumentParser()
    exe_parser.add_argument('infile', type=str, help='<input file> [(full path), -b/-s required]')
    exe_parser.add_argument("-u", "--not_aligned",
                            help="output reads that were not aligned, including those that were aligned multiple times(flat file).",
                            type=str)
    exe_parser.add_argument("-s", "--samout", help="output not aligned reads to [file path].", type=str)
    exe_parser.add_argument("-b", "--ambiguous_out", help="output a fasta file of ambiguous hits [file path].",
                            type=str)
    exe_parser.add_argument("-v", "--verbose", help="verbose. (default = TRUE).", action="store_true")
    exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str)
    exe_parser.add_argument("-f", "--fasta", help="output fasta file of hits (full path).", type=str)
    exe_parser.add_argument("-m", "--min_read_length", help="minimal read length to consider. (default = 60b).",
                            type=int, default=60)
    exe_parser.add_argument("-i", "--min_id", help="minimal percent id of hit to consider. (default = 80).",
                            type=int, default=80)
    exe_parser.add_argument("-z", "--min_score", help="minimal aligner score to consider. (default = 0).",
                            type=int, default=0)
    exe_parser.add_argument("-c", "--max_clip",
                            help="proportion of bases clipped from read for alignment. (default = 0.3).",
                            type=float, default=0.3)
    exe_parser.add_argument("--stranded", help="whether the data is stranded (y, n, reverse). (default = n).", type=str,
                            choices=["y", "n", "reverse"], default="n")
    exe_parser.add_argument("--idattr", help="GFF attribute to be used as feature ID. (default = GeneID).", type=str)
    exe_parser.add_argument("--type", help="feature type (3rd column in GFF file) to be used. (default = CDS).",
                            type=str)
    exe_parser.add_argument("-a", "--minaqual", help="min. alignment quality (default = 0).",
                            type=int, default=0)
    exe_parser.add_argument("-p", "--paired_end_mode",
                            help="input is paired end sorted by name (n) or position (p) . (default = p).", type=str,
                            choices=["p", "n"], default="p")
    exe_parser.add_argument("-o", "--out", help="name of counts output file.", type=str)
    return exe_parser


def _open_pair_reader(infile, pe_order):
    """Return an iterator of mate pairs read from *infile*.

    *infile* is a path, or '-' to read SAM records from standard input.
    *pe_order* is 'p' (use HTSeq.pair_SAM_alignments_with_buffer) or 'n'
    (use HTSeq.pair_SAM_alignments).  The records are paired exactly once;
    for a file path the first record is pulled eagerly so that a missing or
    empty file fails here rather than in the middle of the counting loop.
    """
    if infile == '-':
        records = HTSeq.SAM_Reader(sys.stdin)
    else:
        record_iter = iter(HTSeq.SAM_Reader(infile))
        first_record = next(record_iter)
        records = itertools.chain([first_record], record_iter)
    if pe_order == 'p':
        return HTSeq.pair_SAM_alignments_with_buffer(records)
    return HTSeq.pair_SAM_alignments(records)


def _open_optional_output(path):
    """Open *path* for writing, or return None when no path was given."""
    return open(path, "w") if path else None


def _mate_name(mate):
    """Return the read name of *mate*, or the string 'None' if unavailable."""
    try:
        return mate.read.name
    except AttributeError:
        return 'None'


def _report_rejected(samoutfile, not_aligned_file, alignment, names, reason, listed_as=None):
    """Record why a pair (or one of its mates) was not counted.

    Writes *reason* for *alignment* through write_to_samout when a SAM
    output file is open, and one ``name<TAB>label`` line per entry of
    *names* to the not-aligned report when that file is open.  The label is
    *listed_as* if given, otherwise *reason*.
    """
    if samoutfile is not None:
        write_to_samout(samoutfile, True, alignment, reason)
    if not_aligned_file is not None:
        label = reason if listed_as is None else listed_as
        for name in names:
            not_aligned_file.write(name + '\t' + label + '\n')


def _evaluate_mate(mate, min_read_len, max_clip, min_score, min_id, reverse_strand):
    """Apply the per-read filters to one aligned mate.

    Returns a dict with the keys:
      name, seq          -- read name and sequence
      clipped            -- soft-clipped bases divided by read length
      score, percent_id  -- values derived from parse_opt_fields and the CIGAR
      passed             -- True when the mate passes every filter
      reason             -- rejection label, or None (passed, or too short)
      intervals          -- reference intervals of the non-empty "M" CIGAR
                            operations (strand-inverted when *reverse_strand*);
                            empty unless *passed*
    """
    sequence = mate.read.seq
    read_length = len(sequence)
    # Column 6 of the raw SAM line is the CIGAR string.
    cigar_string = parse_cigar(mate.original_sam_line.split('\t')[5])
    # NOTE(review): the original unpacked the 3rd and the 5th returned value
    # into the same "insertions" name, so the 5th value is the one used.
    # That behaviour is kept; confirm against parse_cigar_alignment.
    soft_clipped, _matched, _shadowed, deletions, insertions = parse_cigar_alignment(cigar_string)
    score, md_matches, _md_deletions, _md_mismatches = parse_opt_fields(mate.optional_fields)
    percent_id = 100.0 * (float(md_matches) / float(read_length - soft_clipped + insertions + deletions))
    clipped = float(soft_clipped) / float(read_length)

    info = {
        'name': mate.read.name,
        'seq': sequence,
        'clipped': clipped,
        'score': score,
        'percent_id': percent_id,
        'passed': False,
        'reason': None,
        'intervals': [],
    }
    if int(read_length) < int(min_read_len):
        # NOTE(review): reads that are too short are dropped without being
        # reported anywhere, as in the original code.
        return info
    if clipped > float(max_clip):
        info['reason'] = 'too_many_bases_clipped_from_read=' + str(soft_clipped)
        return info
    if int(score) < int(min_score):
        info['reason'] = 'alignment_score_too_low=' + str(score)
        return info
    if float(percent_id) < float(min_id):
        info['reason'] = "percent_id_too_low=" + str(percent_id)
        return info

    matched = [op.ref_iv for op in mate.cigar if op.type == "M" and op.size > 0]
    if reverse_strand:
        matched = [invert_strand(iv) for iv in matched]
    info['passed'] = True
    info['intervals'] = matched
    return info


def _fasta_record(info, label):
    """Format one FASTA record for a mate described by an _evaluate_mate dict."""
    return ('>' + info['name'] + '_' + label + '_clipped_' + str(info['clipped']) +
            '_score_' + str(info['score']) + '_percent_id_' + str(info['percent_id']) +
            '\n' + info['seq'] + '\n')


def main():
    """Count paired-end alignments per GFF feature (command-line entry point).

    Reads SAM records from a file or from stdin ('-'), pairs the mates,
    filters each mate on read length, clipped fraction, aligner score and
    percent identity, and assigns every pair to the set of features its
    matched intervals overlap ("union" mode: a pair spanning an intron or
    containing an insertion still counts).  A pair overlapping exactly one
    feature increments that feature's count; other pairs are tallied as
    no_feature, ambiguous, too_low_aQual, not_aligned or
    alignment_not_unique.  Optional outputs: annotated SAM (-s), a report of
    rejected reads (-u), FASTA of unique hits (-f), FASTA of ambiguous hits
    (-b) and the counts table (-o).

    The -p option only accepts 'p' or 'n', so input is always handled as
    paired-end.  Exits with status 1 when the input or GFF path is empty.
    """
    args = _build_count_parser().parse_args()

    paired_end = True  # both permitted -p values describe paired-end input
    pe_order = args.paired_end_mode

    if not args.infile:
        print("no input file given. exiting...")
        sys.exit(1)
    try:
        reader = _open_pair_reader(args.infile, pe_order)
    except Exception:
        print("failed processing SAM/BAM file")
        raise

    if not args.gff:
        print("no gff file given. exiting...")
        sys.exit(1)
    gff_file = args.gff

    verbose = bool(args.verbose)
    min_read_len = args.min_read_length
    max_clip_ = float(args.max_clip)
    min_id = float(args.min_id)
    min_score = int(args.min_score)
    minaqual = args.minaqual
    stranded = {'n': 'no', 'y': 'yes', 'reverse': 'reverse'}[args.stranded]
    id_attribute = args.idattr or "GeneID"
    feature_type = args.type or 'CDS'
    reverse_strand = stranded == "reverse"

    # parse GFF file
    features, counts = gff_reader(gff_file, feature_type, id_attribute, verbose, stranded)

    samoutfile = ambiguousfile = fastafile = not_aligned_file = outfile = None
    try:
        samoutfile = _open_optional_output(args.samout)
        ambiguousfile = _open_optional_output(args.ambiguous_out)
        fastafile = _open_optional_output(args.fasta)
        not_aligned_file = _open_optional_output(args.not_aligned)
        outfile = _open_optional_output(args.out)

        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        read_counter = 0

        for alignment in reader:  # one (mate 1, mate 2) tuple per iteration
            if verbose and read_counter % 100000 == 0:
                print("%d alignment pairs processed" % read_counter)
            # Count the pair up front so the pairs skipped with `continue`
            # below are still included in the total.
            read_counter += 1

            first, second = alignment[0], alignment[1]
            # Per-pair state is rebuilt on every iteration so nothing leaks
            # in from the previous pair.
            iv_seq = []
            accepted = []

            # NOTE(review): unaligned pairs, and pairs whose mates all fail
            # the filters, fall through with no intervals and are therefore
            # also tallied as no_feature below, as in the original code.
            if first is None or not first.aligned:
                notaligned += 1
                _report_rejected(samoutfile, not_aligned_file, alignment,
                                 [_mate_name(first)], "not_aligned")
            elif second is None or not second.aligned:
                notaligned += 1
                _report_rejected(samoutfile, not_aligned_file, alignment,
                                 [_mate_name(second)], "not_aligned")
            else:
                mate_1 = _evaluate_mate(first, min_read_len, max_clip_, min_score, min_id, reverse_strand)
                if mate_1['passed']:
                    accepted.append(mate_1)
                    iv_seq.extend(mate_1['intervals'])
                elif mate_1['reason'] is not None:
                    _report_rejected(samoutfile, not_aligned_file, alignment,
                                     [mate_1['name']], mate_1['reason'])

                mate_2 = _evaluate_mate(second, min_read_len, max_clip_, min_score, min_id, reverse_strand)
                if mate_2['passed']:
                    accepted.append(mate_2)
                    iv_seq.extend(mate_2['intervals'])
                elif mate_2['reason'] is not None:
                    _report_rejected(samoutfile, not_aligned_file, alignment,
                                     [mate_2['name']], mate_2['reason'])

                # NOTE(review): as in the original, the pair-level uniqueness
                # and quality checks only run when mate 2 passed its filters.
                if mate_2['passed']:
                    pair_names = [mate_1['name'], mate_2['name']]
                    try:
                        multi_mapped = (first.optional_field("NH") > 1) or (second.optional_field("NH") > 1)
                    except KeyError:  # no NH tag: treat the pair as unique
                        multi_mapped = False
                    if multi_mapped:
                        # Counted and skipped regardless of which output
                        # files were requested.
                        nonunique += 1
                        # NOTE(review): the report file labels these pairs
                        # 'not_aligned'; label kept from the original.
                        _report_rejected(samoutfile, not_aligned_file, alignment, pair_names,
                                         "alignment_not_unique", listed_as='not_aligned')
                        continue
                    if first.aQual < minaqual or second.aQual < minaqual:
                        lowqual += 1
                        _report_rejected(samoutfile, not_aligned_file, alignment, pair_names,
                                         "too_low_aQual", listed_as='not_aligned')
                        continue

            # Collect every feature overlapped by any matched interval.
            feature_set = set()
            unknown_chrom = False
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    # The reference sequence does not occur in the GFF file.
                    unknown_chrom = True
                    break
                for _step_iv, step_features in features[iv].steps():
                    feature_set.update(step_features)

            if unknown_chrom:
                if samoutfile is not None:
                    write_to_samout(samoutfile, paired_end, alignment, "__no_feature")
                empty += 1
            elif not feature_set:
                _report_rejected(samoutfile, not_aligned_file, alignment, ['None'], "no_feature")
                empty += 1
            elif len(feature_set) > 1:
                label = "ambiguous[" + '+'.join(feature_set) + "]"
                if samoutfile is not None:
                    write_to_samout(samoutfile, paired_end, alignment, label)
                if ambiguousfile is not None:
                    for info in accepted:
                        ambiguousfile.write(_fasta_record(info, label))
                ambiguous += 1
            else:
                feature = next(iter(feature_set))
                if samoutfile is not None:
                    write_to_samout(samoutfile, paired_end, alignment, feature)
                if fastafile is not None:
                    for info in accepted:
                        fastafile.write(_fasta_record(info, feature))
                counts[feature] += 1

        print("total %d alignments processed" % read_counter)

        if outfile is not None:
            for feature in sorted(counts):
                outfile.write("%s\t%d\n" % (feature, counts[feature]))
            outfile.write("no_feature\t%d\n" % empty)
            outfile.write("ambiguous\t%d\n" % ambiguous)
            outfile.write("too_low_aQual\t%d\n" % lowqual)
            outfile.write("not_aligned\t%d\n" % notaligned)
            outfile.write("alignment_not_unique\t%d\n" % nonunique)
    finally:
        # Close every output that was opened, even when processing fails.
        for handle in (samoutfile, ambiguousfile, fastafile, not_aligned_file, outfile):
            if handle is not None:
                handle.close()
示例#30
0
def count_reads_in_features(sam_filename,
                            gff_filename,
                            samtype,
                            order,
                            stranded,
                            overlap_mode,
                            feature_type,
                            id_attribute,
                            quiet,
                            minaqual,
                            samout,
                            include_non_annotated=False,
                            htseq_no_ambiguous=True):
    """
    Tag the alignments of a SAM/BAM file with the GFF feature they overlap.

    Each alignment (or alignment pair) is assigned either a feature id from
    the GFF file or one of the special categories ``__not_aligned``,
    ``__alignment_not_unique``, ``__too_low_aQual``, ``__no_feature`` or
    ``__ambiguous[...]``. Records whose assignment is not filtered out are
    written to ``samout`` with the assignment stored in the ``XF`` tag.

    This is taken from the function count_reads_in_features() from the
    script htseq-count in the HTSeq package version 0.61.p2
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    Args:
        sam_filename: path of the input SAM/BAM file.
        gff_filename: path of the GFF annotation file.
        samtype: "sam" or "bam"; selects the HTSeq reader and the pysam
            open modes.
        order: "name" or "pos"; only consulted for paired-end input, where
            it selects how mates are paired.
        stranded: "no" builds an unstranded feature index; "reverse"
            flips which mate has its intervals strand-inverted before the
            lookup (see the read loop).
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: only GFF records of this type are indexed.
        id_attribute: GFF attribute used as the feature id.
        quiet: not used by this implementation; kept so the signature
            matches htseq-count.
        minaqual: alignments whose aQual is below this value are assigned
            ``__too_low_aQual``.
        samout: path of the annotated output file.
        include_non_annotated: when false, ``__no_feature`` records are not
            written to the output.
        htseq_no_ambiguous: when true, records whose assignment contains
            ``__ambiguous`` are not written to the output.

    Returns:
        The number of records written to ``samout``.

    Raises:
        ValueError: a feature lacks ``id_attribute`` or strand information
            in stranded mode, ``samtype`` is unknown, or ``order`` is
            illegal for paired-end input.
        RuntimeError: no feature of ``feature_type`` was found, the start of
            the input could not be read, or ``overlap_mode`` is illegal.

    Note:
        Working state (``filter_htseq``, ``filter_htseq_no_ambiguous``,
        ``samoutfile``, ``annotated``) is stored as attributes on this
        function object, as in the original implementation.

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under the terms of
    the GNU General Public License as published by the Free Software Foundation,
    either version 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    The full text of the GNU General Public License, version 3,
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Assignments that are never written to the SAM output.
    count_reads_in_features.filter_htseq = [
        "__too_low_aQual", "__not_aligned", "__alignment_not_unique"
    ]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open the output file, borrowing the header of the input file. The
    # input handle is only needed as a template, so it is released even if
    # creating the output fails.
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    try:
        count_reads_in_features.samoutfile = pysam.AlignmentFile(
            samout, flag_write, template=saminfile)
    finally:
        saminfile.close()
    # Counter of annotated records
    count_reads_in_features.annotated = 0

    def write_to_samout(r, assignment):
        """Write the read(s) in r tagged with assignment unless filtered."""
        if assignment in count_reads_in_features.filter_htseq:
            return
        if (count_reads_in_features.filter_htseq_no_ambiguous
                and "__ambiguous" in assignment):
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is None:
                continue
            sam_record = read.to_pysam_AlignedRead(
                count_reads_in_features.samoutfile)
            sam_record.set_tag("XF", assignment, "Z")
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1

    def matched_intervals(aln, invert):
        """Yield the reference intervals of the non-empty "M" CIGAR ops."""
        if invert:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    def overlapping_features(iv_seq):
        """Combine the feature sets under iv_seq according to overlap_mode.

        Returns a set, or None when an intersection mode saw no usable step.
        Raises UnknownChrom for an interval on a chromosome that is absent
        from the feature index.
        """
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs |= fs2
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            strict = overlap_mode == "intersection-strict"
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    # "nonempty" skips steps without any feature; "strict"
                    # lets them empty the intersection.
                    if len(fs2) > 0 or strict:
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        raise RuntimeError("Illegal overlap mode.")

    # From here on the output file is open: close it on every exit path.
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        found_feature = False
        for f in HTSeq.GFF_Reader(gff_filename):
            if f.type != feature_type:
                continue
            try:
                feature_id = f.attr[id_attribute]
            except KeyError:
                raise ValueError(
                    "Feature %s does not contain a '%s' attribute"
                    % (f.name, id_attribute))
            if stranded != "no" and f.iv.strand == ".":
                raise ValueError(
                    "Feature %s at %s does not have strand information but you are "
                    "running htseq-count in stranded mode. Use '--stranded=no'."
                    % (f.name, f.iv))
            features[f.iv] += feature_id
            found_feature = True

        if not found_feature:
            raise RuntimeError(
                "No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to find out whether the input is paired.
        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
            first_read = next(iter(read_seq))
            pe_mode = first_read.paired_end
        except Exception:
            raise RuntimeError(
                "Error occurred when reading beginning of SAM/BAM file.")

        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        reverse = stranded == "reverse"

        for r in read_seq:

            if not pe_mode:
                if not r.aligned:
                    write_to_samout(r, "__not_aligned")
                    continue

                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    # No NH tag on this record: skip the uniqueness check.
                    pass

                if r.aQual < minaqual:
                    write_to_samout(r, "__too_low_aQual")
                    continue

                iv_seq = matched_intervals(r, reverse)
            else:
                mate1, mate2 = r[0], r[1]
                mate1_ok = mate1 is not None and mate1.aligned
                mate2_ok = mate2 is not None and mate2.aligned
                if not (mate1_ok or mate2_ok):
                    write_to_samout(r, "__not_aligned")
                    continue

                # The second mate gets the opposite strand treatment to the
                # first: it is inverted unless running in "reverse" mode.
                iv_seq = itertools.chain(
                    matched_intervals(mate1, reverse) if mate1_ok else (),
                    matched_intervals(mate2, not reverse) if mate2_ok else ())

                try:
                    if (mate1 is not None and mate1.optional_field("NH") > 1) \
                    or (mate2 is not None and mate2.optional_field("NH") > 1):
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    # No NH tag: skip the uniqueness check.
                    pass

                if (mate1 and mate1.aQual < minaqual) or \
                   (mate2 and mate2.aQual < minaqual):
                    write_to_samout(r, "__too_low_aQual")
                    continue

            try:
                fs = overlapping_features(iv_seq)
            except UnknownChrom:
                # Chromosome absent from the GFF: treated as no feature.
                fs = None

            if fs is None or len(fs) == 0:
                write_to_samout(r, "__no_feature")
            elif len(fs) > 1:
                write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
            else:
                write_to_samout(r, list(fs)[0])
    finally:
        count_reads_in_features.samoutfile.close()

    return count_reads_in_features.annotated