Пример #1
0
 def run(self):
     sequences = [ ]
     annotations = [ ]
     for filename in self.filenames:
         any = False
         if io.is_sequence_file(filename):
             sequences.append(filename)
             any = True
         if annotation.is_annotation_file(filename):
             annotations.append(filename)
             any = True            
         if not any:
             raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')
     
     if not sequences:
         assert not annotations, 'Annotations given without any reference sequences.'
         reference = Reference(self.output_dir, must_exist=True)        
     else:
         reference = Reference(self.output_dir, must_exist=False)        
         reference.set_sequences(sequences)
         reference.set_annotations(annotations)
     
     with legion.Stage() as stage:
         if self.genome:
             stage.process(reference.build_genome, self.genome_select)
         if config.apply_ifavailable_program(self.bowtie, 'bowtie2-build'):
             stage.process(reference.build_bowtie_index)
         if config.apply_ifavailable_program(self.ls, 'gmapper-ls'):
             stage.process(reference.build_shrimp_mmap, False)
         if config.apply_ifavailable_program(self.cs, 'gmapper-cs'):
             stage.process(reference.build_shrimp_mmap, True)
         if config.apply_ifavailable_jar(self.snpeff, 'snpEff.jar'):
             stage.process(reference.build_snpeff)
Пример #2
0
 def run(self):
     sequences = [ ]
     annotations = [ ]
     for filename in self.filenames:
         any = False
         if io.is_sequence_file(filename):
             sequences.append(filename)
             any = True
         if annotation.is_annotation_file(filename):
             annotations.append(filename)
             any = True            
         if not any:
             raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')
     
     if not sequences:
         assert not annotations, 'Annotations given without any reference sequences.'
         reference = Reference(self.output_dir, must_exist=True)        
     else:
         reference = Reference(self.output_dir, must_exist=False)        
         reference.set_sequences(sequences)
         reference.set_annotations(annotations)
     
     with open(self.log_filename(),'wb') as f:
         if self.ls:
             reference.build_shrimp_mmap(False, f)
         if self.cs:
             reference.build_shrimp_mmap(True, f)
         if self.bowtie:
             reference.build_bowtie_index(f)
         if self.genome:
             reference.build_genome(self.genome_select)
         if self.snpeff:
             reference.build_snpeff()
Пример #3
0
    def run(self):
        base = os.path.split(self.prefix)[1]
        
        annotations = [ ]
        sequences = [ ]
        
        for filename in self.filenames:
            any = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            assert any, 'File is neither a recognized sequence or annotation file'

        cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
        property_filename = os.path.join(self.prefix,'property.txt')
        gff_filename = os.path.join(self.prefix,base+'.gff')
        output_filenames = [ cytoband_filename, property_filename, gff_filename ] 

        if not os.path.exists(self.prefix):
            os.mkdir(self.prefix)
            
        f = open(property_filename,'wb')
        print >> f, 'ordered=true'
        print >> f, 'id=%s' % base
        print >> f, 'name=%s' % (self.name or base)
        print >> f, 'cytobandFile=%s_cytoband.txt' % base
        print >> f, 'geneFile=%s.gff' % base
        print >> f, 'sequenceLocation=%s' % base
        f.close()
        
        trivia.As_gff(output=gff_filename,
               filenames=annotations,
               exclude=[ 'gene', 'source' ]
        ).run()
        
        f_cyt = open(cytoband_filename,'wb')
        for filename in sequences:
            for name, seq in io.read_sequences(filename):
                assert '/' not in name
                f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
                f.write(seq)
                f.close()
                print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
        f_cyt.close()
        
        genome_filename = self.prefix + '.genome'
        if os.path.exists(genome_filename):
            os.unlink(genome_filename)
        io.execute(
            ['zip', '-j', io.abspath(genome_filename)] +
            [ io.abspath(item) for item in output_filenames ]
        )
        for filename in output_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Пример #4
0
 def run(self):
     sequences = [ ]
     annotations = [ ]
     for filename in self.filenames:
         any = False
         if io.is_sequence_file(filename):
             sequences.append(filename)
             any = True
         if annotation.is_annotation_file(filename):
             annotations.append(filename)
             any = True            
         if not any:
             raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')
     
     reference = Reference(self.output_dir, must_exist=False)        
     reference.set_sequences(sequences)
     reference.set_annotations(annotations)
     if self.ls:
         reference.build_shrimp_mmap(False)
     if self.cs:
         reference.build_shrimp_mmap(True)
Пример #5
0
    def run(self):
        sequences = []
        annotations = []
        for filename in self.filenames:
            any = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            if not any:
                raise grace.Error(
                    filename +
                    ' is neither a sequence file nor an annotation file that nesoni can read.'
                )

        reference = Reference(self.output_dir, must_exist=False)
        reference.set_sequences(sequences)
        reference.set_annotations(annotations)
        if self.ls:
            reference.build_shrimp_mmap(False)
        if self.cs:
            reference.build_shrimp_mmap(True)
Пример #6
0
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):

    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse',
                          'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference(
                ).annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>

        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers) + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = {}  # reference name -> gene index

    feature_names = {}  # feature_name -> number of occurrences

    features = []

    n_features = 0

    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                    ('locus_tag' not in feature.attr
                     or feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [
                    feature.attr.get(item, '') for item in qualifiers
                ]

                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1,
                               f))
                features.append(f)

    else:
        # Sequences as features
        log.log(
            'No annotation files given or found, using sequences as features\n'
        )

        name_feature = {}  # (merged)name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = length
                f.qualifiers = merge_qualifiers.get(name,
                                                    ('', ) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, length)  #...

            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [
                sum(al.get_AS() for al in item) for item in fragment_alignments
            ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
            ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert alignments[
                    0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(
                            fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference',
                  n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded',
                      n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes',
                  n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5: continue
        strandedness.append(
            (n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    strandedness = sum(strandedness) / len(strandedness)
    log.log(
        'Strand specificity: %.0f%%\n'
        '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
        '   Average over all features with at least 5 hits.)\n' % strandedness)

    if use_strand == 'pool':
        getters = [
            lambda f:
            (feature.name, add_lists(feature.count[1], feature.count[-1]),
             add_defdicts(feature.common[(1, 1)], feature.common[
                 (1, -1)], feature.common[(-1, 1)], feature.common[(-1, -1)]),
             add_defdicts(feature.ambiguous[(1, 1)], feature.ambiguous[
                 (1, -1)], feature.ambiguous[(-1, 1)], feature.ambiguous[
                     (-1, -1)]))
        ]
    elif use_strand == 'forward':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[
                (1, 1)], feature.ambiguous[(1, 1)])
        ]
    elif use_strand == 'reverse':
        getters = [
            lambda f: (feature.name, feature.count[-1], feature.common[
                (-1, -1)], feature.ambiguous[(-1, -1)])
        ]
    elif use_strand == 'both':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[
                (1, 1)], feature.ambiguous[(1, 1)]), lambda f:
            (feature.name + 'r', feature.count[-1], feature.common[
                (-1, -1)], feature.ambiguous[(-1, -1)])
        ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [float(min_hits) / item for item in total_hits]
        total_hits = [min_hits] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        ['Feature'] + titles + ['RPKM ' + item for item in titles] +
        ['Length'] + qualifiers + ['On same fragment'] +
        (['Ambiguous alignment'] if expect_multiple_alignments else []))

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [subsample(count[i], p[i]) for i in xrange(n_samples)]

            rpkm = [
                count[i] * 1.0e9 / feature.length / total_hits[i]
                for i in xrange(n_samples)
            ]

            common_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    common.items(), key=lambda item: item[1], reverse=True))
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    ambiguous.items(), key=lambda item: item[1], reverse=True))

            print >> f, tab_encode(
                [feature_name] + [str(item) for item in count] +
                ['%.2f' % item for item in rpkm] + [str(feature.length)] +
                list(feature.qualifiers) + [common_str] +
                ([ambiguous_str] if expect_multiple_alignments else []))

    f.close()
Пример #7
0
def count_run(
    min_score, min_size, max_size, filter_mode, equalize, types, locii,
    qualifiers, use_strand, merge_filename, limit, output_prefix, filenames, log):
    
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono': 
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool','forward','reverse','both'), "Can't understand --strand specification."
    
    from Bio import Seq, SeqIO

    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)    

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]    
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)
    
    assert bam_filenames, 'No reference alignments given' 

    merge = { }
    merge_qualifiers = { }
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
    
        f = open(merge_filename,'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    
    genes = { }  # reference name -> gene index
    
    feature_names = { }   # feature_name -> number of occurrences
    
    features = [ ]
    
    n_features = 0

    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue
            
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None
            
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length
    
    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    
    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'
    
        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue
                
                if (locii is not None and
                    ('locus_tag' not in feature.attr or
                     feature.attr['locus_tag'].lower() not in locii)):
                    continue
                
                f = Feature(n_samples)
                f.name = feature.get_id()                
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name,0)+1
                if feature_names[f.name] > 1:
                   f.name += '/%d' % feature_names[f.name]
                   
                f.qualifiers = [ feature.attr.get(item,'') for item in qualifiers ]
                
                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)

    else:
        # Sequences as features        
        log.log('No annotation files given or found, using sequences as features\n')
        
        name_feature = { } # (merged)name -> feature
        
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            
            if merged_name not in name_feature: 
                f = Feature(n_samples)
                f.name = merged_name
                f.length = length
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, length) #...
            
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))
                
                

    
    log.log('%d features\n\n' % len(features))

    for name in genes: 
        genes[name].prepare()
        
    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(bam_filenames[i], 'Counting sample %d of %d' % (i+1,n_samples)):
            n_fragments[i] += 1
            
            if not fragment_alignments: 
                continue
            
            n_fragments_aligned[i] += 1
            
            feature_hits = [ ] # [ [ (feature, strand) ] ]
            
            # Use only top scoring alignments
            fragment_scores = [ sum( al.get_AS() for al in item ) for item in fragment_alignments ]
            
            best_score = max(fragment_scores)
            
            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue
            
            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score            
            fragment_alignments = [ item 
                                    for item, score in zip(fragment_alignments, fragment_scores)
                                    if score >= cutoff ]            
            
            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
            
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end-start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue
                
                rname = alignments[0].rname
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
                assert alignments[0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'
            
                this_feature_hits = [ ]    
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append( key )
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1
                
                if this_feature_hits: 
                    feature_hits.append( this_feature_hits )
                
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1],b[1])][b[0]] += 1
                                
                
            if len(feature_hits) > 0: 
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print
            
            
            if len(feature_hits) > 1: 
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1],b[1])][b[0]] += 1
            
            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1
            
            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i],'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append( (n_forward-n_reverse)*100.0 / (n_forward+n_reverse) )
    strandedness = sum(strandedness) / len(strandedness)
    log.log('Strand specificity: %.0f%%\n'
            '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
            '   Average over all features with at least 5 hits.)\n'
            % strandedness)


    if use_strand == 'pool':
        getters = [ lambda f: (feature.name, 
                               add_lists(feature.count[1],feature.count[-1]),
                               add_defdicts(feature.common[(1,1)], 
                                            feature.common[(1,-1)], 
                                            feature.common[(-1,1)], 
                                            feature.common[(-1,-1)]),
                               add_defdicts(feature.ambiguous[(1,1)], 
                                            feature.ambiguous[(1,-1)], 
                                            feature.ambiguous[(-1,1)], 
                                            feature.ambiguous[(-1,-1)])) ]
    elif use_strand == 'forward':
        getters = [ lambda f: (feature.name, feature.count[1], feature.common[(1,1)], feature.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (feature.name, feature.count[-1], feature.common[(-1,-1)], feature.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [ lambda f: (feature.name, feature.count[1], feature.common[(1,1)], feature.ambiguous[(1,1)]),
                    lambda f: (feature.name + 'r', feature.count[-1], feature.common[(-1,-1)], feature.ambiguous[(-1,-1)]) ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
    )
    
    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            
            if equalize:
                count = [
                    subsample(count[i], p[i])
                    for i in xrange(n_samples)
                ]
            
            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]
            
            common_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(common.items(), key=lambda item:item[1], reverse=True)
            ) 
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(ambiguous.items(), key=lambda item:item[1], reverse=True)
            ) 
            
            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ]) 
            )
            

    f.close()