Exemplo n.º 1
0
    def _load_bam(self, filename):
        spans = { }

        if self.filter == 'poly':
            use_bam_filename = 'alignments.bam'
            use_only_top = True
            use_only_monogamous = False
            expect_multiple_alignments = True
        elif self.filter == 'mono': 
            use_bam_filename = 'alignments.bam'
            use_only_top = True
            use_only_monogamous = True
            expect_multiple_alignments = True
        else:
            assert self.filter == 'existing', 'Unrecognized filtering mode'
            use_bam_filename = 'alignments_filtered.bam'
            use_only_top = False
            use_only_monogamous = False
            expect_multiple_alignments = False
        
        if os.path.isdir(filename):
            filename = os.path.join(filename, use_bam_filename)
        
        for read_name, fragment_alignments, unmapped in \
                sam.bam_iter_fragments(
                    filename, 
                    'Scanning'):
            if not fragment_alignments:
                continue
                
            if use_only_top:
                fragment_scores = [ sum( al.get_AS() for al in item ) for item in fragment_alignments ]            
                best_score = max(fragment_scores)
                fragment_alignments = [ 
                    item 
                    for item, score in zip(fragment_alignments, fragment_scores)
                    if score >= best_score ]            
            
            for alignments in fragment_alignments:
                if self.strand_specific:
                    strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
                else:
                    strand = 0
            
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                if end-start <= self.trim*2: continue
                
                rname = alignments[0].rname
                if (rname,strand) not in spans: 
                    spans[(rname,strand)] = [ ]          
                spans[(rname, strand)].append((start+self.trim,end-self.trim))
                
        if self.deduplicate:
            for key in spans:
                spans[key] = list(set(spans[key]))
        
        return spans
Exemplo n.º 2
0
    def _load_bam(self, filename):
        spans = { }

        if self.filter == 'poly':
            use_bam_filename = 'alignments.bam'
            use_only_top = True
            use_only_monogamous = False
            expect_multiple_alignments = True
        elif self.filter == 'mono': 
            use_bam_filename = 'alignments.bam'
            use_only_top = True
            use_only_monogamous = True
            expect_multiple_alignments = True
        else:
            assert self.filter == 'existing', 'Unrecognized filtering mode'
            use_bam_filename = 'alignments_filtered.bam'
            use_only_top = False
            use_only_monogamous = False
            expect_multiple_alignments = False
        
        if os.path.isdir(filename):
            filename = os.path.join(filename, use_bam_filename)
        
        for read_name, fragment_alignments, unmapped in \
                sam.bam_iter_fragments(
                    filename, 
                    'Scanning'):
            if not fragment_alignments:
                continue
                
            if use_only_top:
                fragment_scores = [ sum( al.get_AS() for al in item ) for item in fragment_alignments ]            
                best_score = max(fragment_scores)
                fragment_alignments = [ 
                    item 
                    for item, score in zip(fragment_alignments, fragment_scores)
                    if score >= best_score ]            
            
            for alignments in fragment_alignments:
                if self.strand_specific:
                    strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
                else:
                    strand = 0
            
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                
                if self.what == '5prime':
                   if strand >= 0:
                       end = start+1
                   else:
                       start = end-1
                elif self.what == '3prime':
                   if strand >= 0:
                       start = end-1
                   else:
                       end = start+1
                
                if end+self.lap-start <= 0: continue
                
                rname = alignments[0].rname
                if (rname,strand) not in spans: 
                    spans[(rname,strand)] = [ ]          
                spans[(rname, strand)].append((start,end+self.lap))
                
        if self.deduplicate:
            for key in spans:
                spans[key] = list(set(spans[key]))
        
        return spans
Exemplo n.º 3
0
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):

    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse',
                          'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference(
                ).annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>

        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers) + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = {}  # reference name -> gene index

    feature_names = {}  # feature_name -> number of occurrences

    features = []

    n_features = 0

    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                    ('locus_tag' not in feature.attr
                     or feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [
                    feature.attr.get(item, '') for item in qualifiers
                ]

                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1,
                               f))
                features.append(f)

    else:
        # Sequences as features
        log.log(
            'No annotation files given or found, using sequences as features\n'
        )

        name_feature = {}  # (merged)name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = length
                f.qualifiers = merge_qualifiers.get(name,
                                                    ('', ) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, length)  #...

            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [
                sum(al.get_AS() for al in item) for item in fragment_alignments
            ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
            ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert alignments[
                    0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(
                            fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference',
                  n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded',
                      n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes',
                  n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5: continue
        strandedness.append(
            (n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    strandedness = sum(strandedness) / len(strandedness)
    log.log(
        'Strand specificity: %.0f%%\n'
        '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
        '   Average over all features with at least 5 hits.)\n' % strandedness)

    if use_strand == 'pool':
        getters = [
            lambda f:
            (feature.name, add_lists(feature.count[1], feature.count[-1]),
             add_defdicts(feature.common[(1, 1)], feature.common[
                 (1, -1)], feature.common[(-1, 1)], feature.common[(-1, -1)]),
             add_defdicts(feature.ambiguous[(1, 1)], feature.ambiguous[
                 (1, -1)], feature.ambiguous[(-1, 1)], feature.ambiguous[
                     (-1, -1)]))
        ]
    elif use_strand == 'forward':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[
                (1, 1)], feature.ambiguous[(1, 1)])
        ]
    elif use_strand == 'reverse':
        getters = [
            lambda f: (feature.name, feature.count[-1], feature.common[
                (-1, -1)], feature.ambiguous[(-1, -1)])
        ]
    elif use_strand == 'both':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[
                (1, 1)], feature.ambiguous[(1, 1)]), lambda f:
            (feature.name + 'r', feature.count[-1], feature.common[
                (-1, -1)], feature.ambiguous[(-1, -1)])
        ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [float(min_hits) / item for item in total_hits]
        total_hits = [min_hits] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        ['Feature'] + titles + ['RPKM ' + item for item in titles] +
        ['Length'] + qualifiers + ['On same fragment'] +
        (['Ambiguous alignment'] if expect_multiple_alignments else []))

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [subsample(count[i], p[i]) for i in xrange(n_samples)]

            rpkm = [
                count[i] * 1.0e9 / feature.length / total_hits[i]
                for i in xrange(n_samples)
            ]

            common_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    common.items(), key=lambda item: item[1], reverse=True))
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    ambiguous.items(), key=lambda item: item[1], reverse=True))

            print >> f, tab_encode(
                [feature_name] + [str(item) for item in count] +
                ['%.2f' % item for item in rpkm] + [str(feature.length)] +
                list(feature.qualifiers) + [common_str] +
                ([ambiguous_str] if expect_multiple_alignments else []))

    f.close()
Exemplo n.º 4
0
def count_run(
    min_score, min_size, max_size, filter_mode, equalize, types, locii,
    qualifiers, use_strand, merge_filename, limit, output_prefix, filenames, log):
    
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono': 
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool','forward','reverse','both'), "Can't understand --strand specification."
    
    from Bio import Seq, SeqIO

    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)    

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]    
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)
    
    assert bam_filenames, 'No reference alignments given' 

    merge = { }
    merge_qualifiers = { }
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
    
        f = open(merge_filename,'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    
    genes = { }  # reference name -> gene index
    
    feature_names = { }   # feature_name -> number of occurrences
    
    features = [ ]
    
    n_features = 0

    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue
            
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None
            
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length
    
    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    
    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'
    
        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue
                
                if (locii is not None and
                    ('locus_tag' not in feature.attr or
                     feature.attr['locus_tag'].lower() not in locii)):
                    continue
                
                f = Feature(n_samples)
                f.name = feature.get_id()                
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name,0)+1
                if feature_names[f.name] > 1:
                   f.name += '/%d' % feature_names[f.name]
                   
                f.qualifiers = [ feature.attr.get(item,'') for item in qualifiers ]
                
                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)

    else:
        # Sequences as features        
        log.log('No annotation files given or found, using sequences as features\n')
        
        name_feature = { } # (merged)name -> feature
        
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            
            if merged_name not in name_feature: 
                f = Feature(n_samples)
                f.name = merged_name
                f.length = length
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, length) #...
            
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))
                
                

    
    log.log('%d features\n\n' % len(features))

    for name in genes: 
        genes[name].prepare()
        
    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(bam_filenames[i], 'Counting sample %d of %d' % (i+1,n_samples)):
            n_fragments[i] += 1
            
            if not fragment_alignments: 
                continue
            
            n_fragments_aligned[i] += 1
            
            feature_hits = [ ] # [ [ (feature, strand) ] ]
            
            # Use only top scoring alignments
            fragment_scores = [ sum( al.get_AS() for al in item ) for item in fragment_alignments ]
            
            best_score = max(fragment_scores)
            
            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue
            
            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score            
            fragment_alignments = [ item 
                                    for item, score in zip(fragment_alignments, fragment_scores)
                                    if score >= cutoff ]            
            
            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
            
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end-start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue
                
                rname = alignments[0].rname
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
                assert alignments[0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'
            
                this_feature_hits = [ ]    
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append( key )
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1
                
                if this_feature_hits: 
                    feature_hits.append( this_feature_hits )
                
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1],b[1])][b[0]] += 1
                                
                
            if len(feature_hits) > 0: 
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print
            
            
            if len(feature_hits) > 1: 
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1],b[1])][b[0]] += 1
            
            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1
            
            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i],'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append( (n_forward-n_reverse)*100.0 / (n_forward+n_reverse) )
    strandedness = sum(strandedness) / len(strandedness)
    log.log('Strand specificity: %.0f%%\n'
            '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
            '   Average over all features with at least 5 hits.)\n'
            % strandedness)


    if use_strand == 'pool':
        getters = [ lambda f: (feature.name, 
                               add_lists(feature.count[1],feature.count[-1]),
                               add_defdicts(feature.common[(1,1)], 
                                            feature.common[(1,-1)], 
                                            feature.common[(-1,1)], 
                                            feature.common[(-1,-1)]),
                               add_defdicts(feature.ambiguous[(1,1)], 
                                            feature.ambiguous[(1,-1)], 
                                            feature.ambiguous[(-1,1)], 
                                            feature.ambiguous[(-1,-1)])) ]
    elif use_strand == 'forward':
        getters = [ lambda f: (feature.name, feature.count[1], feature.common[(1,1)], feature.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (feature.name, feature.count[-1], feature.common[(-1,-1)], feature.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [ lambda f: (feature.name, feature.count[1], feature.common[(1,1)], feature.ambiguous[(1,1)]),
                    lambda f: (feature.name + 'r', feature.count[-1], feature.common[(-1,-1)], feature.ambiguous[(-1,-1)]) ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
    )
    
    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            
            if equalize:
                count = [
                    subsample(count[i], p[i])
                    for i in xrange(n_samples)
                ]
            
            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]
            
            common_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(common.items(), key=lambda item:item[1], reverse=True)
            ) 
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(ambiguous.items(), key=lambda item:item[1], reverse=True)
            ) 
            
            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ]) 
            )
            

    f.close()
Exemplo n.º 5
0
     def run(self):
         assert self.extension is not None, '--extension must be specified'
     
         #workspace = self.get_workspace()
         workspace = working_directory.Working(self.working_dir, must_exist=True)
         if self.annotations == None:
             reference = workspace.get_reference()
             annotations_filename = reference.annotations_filename()
         else:
             annotations_filename = self.annotations
         
         types = [ item.lower() for item in self.types.split(',') ]
     
         annotations = [ 
             item 
             for item in annotation.read_annotations(annotations_filename)
             if item.type.lower() in types
         ]
         
         self.log.log('%d annotations\n' % len(annotations))
         
         assert annotations, 'No annotations of specified types in file'
         
         index = { }
         
         for item in annotations:
             if item.strand >= 0:
                 item.tail_pos = item.end
                 item.end += self.extension
             else:
                 item.tail_pos = item.start
                 item.start -= self.extension
         
             if item.seqid not in index:
                 index[item.seqid] = span_index.Span_index()
             index[item.seqid].insert(item)
             
             item.hits = [] # [ (rel_start, rel_end, tail_length) ]
         
         for item in index.itervalues(): 
             item.prepare()
         
         for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(workspace/'alignments_filtered.bam'):
             for fragment in fragment_alignments:
                 start = min(item.pos-1 for item in fragment)
                 end = max(item.pos+item.length-1 for item in fragment)
                 alignment_length = end-start
                 strand = -1 if fragment[0].flag&sam.FLAG_REVERSE else 1
                 
                 if strand >= 0:
                     tail_pos = end
                 else:
                     tail_pos = start
                 
                 tail_length = 0
                 adaptor_bases = 0
                 for item in fragment[0].extra:
                     if item.startswith('AN:i:'):
                         tail_length = int(item[5:])
                     elif item.startswith('AD:i:'):
                         adaptor_bases = int(item[5:])
                 
                 if fragment[0].rname in index:
                     hits = [ 
                         gene
                         for gene in index[fragment[0].rname].get(start,end)
                         if gene.strand == strand
                         ]                         
                     if hits:
                         gene = min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.get_id()))
                             # Nearest by tail_pos
                             # failing that, by id to ensure a deterministic choice
                             
                         if strand > 0:
                             rel_start = start - gene.start
                             rel_end = end - gene.start
                         else:
                             rel_start = gene.end - end
                             rel_end = gene.end - start
                         
                         gene.hits.append( (rel_start,rel_end,tail_length,adaptor_bases) )

         f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
         pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
         f.close()