Example #1
 def reversed(self): # Reverse query sequence
     result = copy.copy(self)
     result.query_seq = bio.reverse_complement(result.query_seq)
     result.query_start, result.query_end = \
         len(self.query_seq)-result.query_end, len(self.query_seq)-result.query_start
     result.query_forward = not result.query_forward
     return result
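The coordinate swap above follows the usual convention for half-open intervals: reverse-complementing a sequence of length L maps [start, end) to [L - end, L - start). A minimal sketch of that arithmetic, with a hypothetical stand-in for bio.reverse_complement (the examples assume the real one comes from the project's bio module):

    # Hypothetical stand-in for bio.reverse_complement, for illustration only.
    def reverse_complement(seq):
        complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
        return ''.join(complement[base] for base in reversed(seq.upper()))

    seq = 'AACGT'
    start, end = 1, 4                       # seq[1:4] == 'ACG'
    rc = reverse_complement(seq)            # 'ACGTT'
    new_start, new_end = len(seq) - end, len(seq) - start
    assert rc[new_start:new_end] == reverse_complement(seq[start:end])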
Example #2
 def get_interpeak_seq(peaks):
     start = min(item.transcription_stop for item in peaks)
     end = max(item.transcription_stop for item in peaks)
     if end-start > self.max_seq: return ''
     if peaks[0].strand >= 0:
         return chromosomes[peaks[0].seqid][start:end]
     else:
         return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])
Example #3
 def get_interpeak_seq(peaks):
     start = min(item.transcription_stop for item in peaks)
     end = max(item.transcription_stop for item in peaks)
     if end - start > self.max_seq: return ''
     if peaks[0].strand >= 0:
         return chromosomes[peaks[0].seqid][start:end]
     else:
         return bio.reverse_complement(
             chromosomes[peaks[0].seqid][start:end])
Example #4
def expected_depth(name, seq, depths, ambig_depths):
    med = numpy.median(depths)
    sane = numpy.arange(len(depths))[ (depths > med*0.5) & (depths < med*2.0) & (depths*2.0 >= ambig_depths)]
    #print 'median', med, 'using', len(sane)
    
    if sum(sane) < 100:
        warn('Skipping depth correction on ' + name)
        return numpy.array( [numpy.average(depths)] * len(depths) ) 
    
    buckets = { }
    
    radius = 2 # examine 5-mers
    n = radius*2+1
    
    sseqq = seq[len(seq)-radius:] + seq + seq[:radius]
    for i in sane:
        s = sseqq[i:i+n]
        if s not in buckets: buckets[s] = [ ]
        buckets[s].append( depths[i] )
    
    # Pool with reverse complement
    new_buckets = { }
    for kmer in buckets:
        rc = bio.reverse_complement(kmer)
        new_buckets[kmer] = buckets[kmer] + buckets.get(rc,[])
    buckets = new_buckets
    
    for key in buckets:
        buckets[key] = numpy.average(buckets[key])
    
    prediction = numpy.zeros(len(seq), 'float')
    for i in xrange(len(seq)):
        s = sseqq[i:i+n]
        prediction[i] = buckets.get(s,0.0)
    
    
    # selection of radii from 8 to 4096
    # TODO: make this configurable, or perhaps just larger
    radii = [ int(2**(0.5*i)) for i in xrange(3*2,12*2+1) ]
    
    prediction_windower = windower(prediction, radii[-1]) 
    
    a = numpy.arange(len(seq)) / float(len(seq))
    predictors = numpy.transpose(
    [   numpy.ones(len(seq), 'float'),
        numpy.cos(a * (2.0*numpy.pi)), 
        numpy.sin(a * (2.0*numpy.pi)),
    ] + [
        use_windower(prediction_windower, radius)
        for radius in radii
    ]
    )
    
    x = linalg.lstsq(predictors[sane], depths[sane])[0]
    #print x
    prediction = numpy.sum(predictors * x[None,:], 1)
    return prediction
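The bucketing step above estimates a per-position depth bias from the surrounding 5-mer, pooling each k-mer's observations with those of its reverse complement since coverage is strand-symmetric. A minimal sketch of the pooling alone, reusing the stand-in reverse_complement from Example #1 (depth values are made up):

    # buckets maps k-mer -> list of observed depths at 'sane' positions.
    buckets = {'ACGTA': [10.0, 12.0], 'TACGT': [11.0], 'AAAAA': [3.0]}

    pooled = {}
    for kmer in buckets:
        rc = reverse_complement(kmer)
        pooled[kmer] = buckets[kmer] + buckets.get(rc, [])

    # 'ACGTA' and 'TACGT' are reverse complements, so each sees both lists.
    assert sorted(pooled['ACGTA']) == [10.0, 11.0, 12.0]
    assert sorted(pooled['TACGT']) == [10.0, 11.0, 12.0]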
Example #5
 def get_prepeak_seq(gene,peaks):
     if gene.strand >= 0:
         start = gene.utr_pos
         end = min(item.transcription_stop for item in peaks)
         if end-start > self.max_seq: return ''
         return chromosomes[gene.seqid][start:end]
     else:
         start = max(item.transcription_stop for item in peaks)
         end = gene.utr_pos
         if end-start > self.max_seq: return ''
         return bio.reverse_complement(chromosomes[gene.seqid][start:end])
Example #6
def expected_depth(name, seq, depths, ambig_depths):
    med = numpy.median(depths)
    sane = numpy.arange(
        len(depths))[(depths > med * 0.5) & (depths < med * 2.0) &
                     (depths * 2.0 >= ambig_depths)]
    #print 'median', med, 'using', len(sane)

    if sum(sane) < 100:
        warn('Skipping depth correction on ' + name)
        return numpy.array([numpy.average(depths)] * len(depths))

    buckets = {}

    radius = 2  # examine 5-mers
    n = radius * 2 + 1

    sseqq = seq[len(seq) - radius:] + seq + seq[:radius]
    for i in sane:
        s = sseqq[i:i + n]
        if s not in buckets: buckets[s] = []
        buckets[s].append(depths[i])

    # Pool with reverse complement
    new_buckets = {}
    for kmer in buckets:
        rc = bio.reverse_complement(kmer)
        new_buckets[kmer] = buckets[kmer] + buckets.get(rc, [])
    buckets = new_buckets

    for key in buckets:
        buckets[key] = numpy.average(buckets[key])

    prediction = numpy.zeros(len(seq), 'float')
    for i in xrange(len(seq)):
        s = sseqq[i:i + n]
        prediction[i] = buckets.get(s, 0.0)

    # selection of radii from 8 to 4096
    # TODO: make this configurable, or perhaps just larger
    radii = [int(2**(0.5 * i)) for i in xrange(3 * 2, 12 * 2 + 1)]

    prediction_windower = windower(prediction, radii[-1])

    a = numpy.arange(len(seq)) / float(len(seq))
    predictors = numpy.transpose([
        numpy.ones(len(seq), 'float'),
        numpy.cos(a * (2.0 * numpy.pi)),
        numpy.sin(a * (2.0 * numpy.pi)),
    ] + [use_windower(prediction_windower, radius) for radius in radii])

    x = linalg.lstsq(predictors[sane], depths[sane])[0]
    #print x
    prediction = numpy.sum(predictors * x[None, :], 1)
    return prediction
Example #7
 def get_prepeak_seq(gene, peaks):
     if gene.strand >= 0:
         start = gene.utr_pos
         end = min(item.transcription_stop for item in peaks)
         if end - start > self.max_seq: return ''
         return chromosomes[gene.seqid][start:end]
     else:
         start = max(item.transcription_stop for item in peaks)
         end = gene.utr_pos
         if end - start > self.max_seq: return ''
         return bio.reverse_complement(
             chromosomes[gene.seqid][start:end])
Example #8
 def get_seq(self, seq_dict):
     seq = seq_dict[self.seqid]
     if self.end <= 0 or self.start >= len(seq):
         extract = 'N' * (self.end-self.start)
     else:
         extract = seq[max(self.start,0):min(self.end,len(seq))]
         if self.start < 0:
             extract = 'N' * -self.start + extract
         if self.end > len(seq):
             extract = extract + 'N' * (self.end-len(seq))
     if self.strand < 0:
         extract = bio.reverse_complement(extract)
     return extract
Example #9
 def get_seq(self, seq_dict):
     seq = seq_dict[self.seqid]
     if self.end <= 0 or self.start >= len(seq):
         extract = 'N' * (self.end - self.start)
     else:
         extract = seq[max(self.start, 0):min(self.end, len(seq))]
         if self.start < 0:
             extract = 'N' * -self.start + extract
         if self.end > len(seq):
             extract = extract + 'N' * (self.end - len(seq))
     if self.strand < 0:
         extract = bio.reverse_complement(extract)
     return extract
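get_seq always returns exactly end - start characters: any part of the interval hanging over either edge of the chromosome is padded with 'N' before the optional reverse complement. The padding logic in isolation (hypothetical inputs):

    seq = 'ACGTACGT'           # pretend chromosome
    start, end = -2, 3         # extends 2 bases past the left edge

    extract = seq[max(start, 0):min(end, len(seq))]    # 'ACG'
    if start < 0:
        extract = 'N' * -start + extract
    if end > len(seq):
        extract = extract + 'N' * (end - len(seq))

    assert extract == 'NNACG'
    assert len(extract) == end - start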
Example #10
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool,
                                             False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int,
                                                20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = []
    contig_filenames = []
    grace.execute(args,
                  {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([(name.split()[0], seq) for filename in contig_filenames
                    for name, seq in io.read_sequences(filename)])
    dir_contigs = {}
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = {}
    for name in dir_contigs:
        dir_contigs_used[name] = [False] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [-1] * (len(ref_seq) * 2)
            strings = ['N', ''] * (len(ref_seq))
            contexts = [None for i in xrange(len(ref_seq) * 2)]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix',
                    temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch',
                    '9',
                    '--mincluster',
                    '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename
                ])

                for contig_name, contig_seq in io.read_sequences(
                        contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run([
                        'show-aligns', temp_prefix + '.delta', 'ref',
                        contig_name
                    ],
                            stderr=subprocess.PIPE)

                    alignments = []

                    while True:
                        line = p.stdout.readline()
                        if not line: break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = []
                        al_query = []

                        while True:
                            block = []
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('--   END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end: break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1  #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[
                                    dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(
                                    al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower(
                            ) == dir_contigs[dir_contig_name][
                                query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos,
                                query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end
                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None: continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [
            (a or b)
            for a, b in zip(dir_contigs_used[name +
                                             '+'], dir_contigs_used[name +
                                                                    '-'][::-1])
        ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
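The leftovers pass at the end of pastiche scans each contig's usage mask for runs of unused positions longer than min_leftover and writes them out under 1-based ':start..end' names. The scan itself is independent of the alignment machinery; a small sketch with a made-up mask:

    used = [True, False, False, False, False, True, True]   # made-up mask
    min_leftover = 2

    runs = []
    i = 0
    while i < len(used):
        j = i
        while j < len(used) and not used[j]:
            j += 1
        if j - i > min_leftover:
            runs.append((i, j))     # half-open run of unused positions
        i = j + 1

    assert runs == [(1, 5)]         # written as ':2..5' in 1-based naming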
Example #11
def expected_depth(name, seq, depths, ambig_depths, radius=2):
    import numpy
    from numpy import linalg

    med = numpy.median(depths)
    sane = numpy.arange(len(depths))[ (depths > med*0.5) & (depths < med*2.0) & (depths*2.0 >= ambig_depths)]
    #print 'median', med, 'using', len(sane)
    
    if sum(sane) < 100:
        warn('Skipping depth correction on ' + name)
        return numpy.array( [numpy.average(depths)] * len(depths) ) 
    
    buckets = { }
    
    #radius = 2 # examine 5-mers
    n = radius*2+1
    
    sseqq = seq[len(seq)-radius:] + seq + seq[:radius]
    for i in sane:
        s = sseqq[i:i+n]
        if s not in buckets: buckets[s] = [ ]
        buckets[s].append( depths[i] )
    
    # Pool with reverse complement
    new_buckets = { }
    for kmer in buckets:
        rc = bio.reverse_complement(kmer)
        pool = buckets[kmer] + buckets.get(rc,[])
        new_buckets[kmer] = pool 
    buckets = new_buckets
    
    buckets_individual = buckets.copy()    
    for key in buckets:
        buckets[key] = numpy.average(buckets[key])
        
    
    avg_depth = numpy.average(depths[sane])
    listing = [ (key, numpy.log(value)-numpy.log(avg_depth) ) for key,value in buckets.items()
                 if key <= bio.reverse_complement(key) ]
    listing.sort(key=lambda x: abs(x[1]), reverse=True)
    print 'Top k-mer log2 fold change'
    for key,value in listing[:10]:
        print key, '% .2f' % (value / numpy.log(2.0)), '(%d)' % len(buckets_individual[key])
    print
    
    prediction = numpy.zeros(len(seq), 'float')
    for i in xrange(len(seq)):
        s = sseqq[i:i+n]
        prediction[i] = buckets.get(s,0.0)
    
    
    # selection of radii from 8 to 4096
    # TODO: make this configurable, or perhaps just larger
    radii = [ int(2**(0.5*i)) for i in xrange(3*2,12*2+1) ]
    
    prediction_windower = windower(numpy.log(prediction), radii[-1]) 
    
    a = numpy.arange(len(seq)) / float(len(seq))
    predictors = numpy.transpose(
    [   numpy.ones(len(seq), 'float'),
        numpy.cos(a * (2.0*numpy.pi)), 
        numpy.sin(a * (2.0*numpy.pi)),
    ] + [
        use_windower(prediction_windower, radius)
        for radius in radii
    ]
    )
    
    x = linalg.lstsq(predictors[sane], numpy.log(depths[sane]))[0]
    #print x
    prediction = numpy.sum(predictors * x[None,:], 1)
    
    change = numpy.median( numpy.abs( numpy.log(depths) - prediction ) )
    print 'Median log2 fold error:', change / numpy.log(2.0)
    print
    print
    
    return numpy.exp( prediction )
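Unlike Example #4, this variant fits the model in log space, so coefficients and residuals come out in natural-log units; dividing by log(2), as in the reporting code above, converts them to log2 fold changes. For instance:

    import numpy
    natural_log_fold = numpy.log(4.0)          # a 4-fold change
    print natural_log_fold / numpy.log(2.0)    # 2.0, i.e. log2(4)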
Example #12
def expected_depth(name, seq, depths, ambig_depths, radius=2):
    import numpy
    from numpy import linalg

    med = numpy.median(depths)
    sane = numpy.arange(
        len(depths))[(depths > med * 0.5) & (depths < med * 2.0) &
                     (depths * 2.0 >= ambig_depths)]
    #print 'median', med, 'using', len(sane)

    if sum(sane) < 100:
        warn('Skipping depth correction on ' + name)
        return numpy.array([numpy.average(depths)] * len(depths))

    buckets = {}

    #radius = 2 # examine 5-mers
    n = radius * 2 + 1

    sseqq = seq[len(seq) - radius:] + seq + seq[:radius]
    for i in sane:
        s = sseqq[i:i + n]
        if s not in buckets: buckets[s] = []
        buckets[s].append(depths[i])

    # Pool with reverse complement
    new_buckets = {}
    for kmer in buckets:
        rc = bio.reverse_complement(kmer)
        pool = buckets[kmer] + buckets.get(rc, [])
        new_buckets[kmer] = pool
    buckets = new_buckets

    buckets_individual = buckets.copy()
    for key in buckets:
        buckets[key] = numpy.average(buckets[key])

    avg_depth = numpy.average(depths[sane])
    listing = [(key, numpy.log(value) - numpy.log(avg_depth))
               for key, value in buckets.items()
               if key <= bio.reverse_complement(key)]
    listing.sort(key=lambda x: abs(x[1]), reverse=True)
    print 'Top k-mer log2 fold change'
    for key, value in listing[:10]:
        print key, '% .2f' % (value / numpy.log(2.0)), '(%d)' % len(
            buckets_individual[key])
    print

    prediction = numpy.zeros(len(seq), 'float')
    for i in xrange(len(seq)):
        s = sseqq[i:i + n]
        prediction[i] = buckets.get(s, 0.0)

    # selection of radii from 8 to 4096
    # TODO: make this configurable, or perhaps just larger
    radii = [int(2**(0.5 * i)) for i in xrange(3 * 2, 12 * 2 + 1)]

    prediction_windower = windower(numpy.log(prediction), radii[-1])

    a = numpy.arange(len(seq)) / float(len(seq))
    predictors = numpy.transpose([
        numpy.ones(len(seq), 'float'),
        numpy.cos(a * (2.0 * numpy.pi)),
        numpy.sin(a * (2.0 * numpy.pi)),
    ] + [use_windower(prediction_windower, radius) for radius in radii])

    x = linalg.lstsq(predictors[sane], numpy.log(depths[sane]))[0]
    #print x
    prediction = numpy.sum(predictors * x[None, :], 1)

    change = numpy.median(numpy.abs(numpy.log(depths) - prediction))
    print 'Median log2 fold error:', change / numpy.log(2.0)
    print
    print

    return numpy.exp(prediction)
Example #13
    def run(self):
        seqs = env.load_ref(self.reference).seqs

        result = []
        errors = []
        with open(self.csv_file, "rU") as f:
            reader = csv.reader(f)
            headings = reader.next()
            headings = [item.lower() for item in headings]
            assert "id" in headings
            assert "primer" in headings
            id_col = headings.index("id")
            primer_col = headings.index("primer")

            for row in reader:
                if len(row) == 0 or (not row[id_col].strip()
                                     and not row[primer_col].strip()):
                    continue

                id = row[id_col].strip()
                assert " " not in id, "ID contains space: " + id
                primer = row[primer_col].strip().upper()
                assert len(primer) > self.skip, "Primer too short: " + id
                assert all(char in "ACGT"
                           for char in primer), "Primer not ACGT: " + id
                primer = primer[self.skip:]
                rprimer = bio.reverse_complement(primer)

                hits = []
                for seq_name in seqs:
                    for match in re.finditer(primer, seqs[seq_name],
                                             re.IGNORECASE):
                        hits.append((seq_name, 1, match.start(),
                                     match.start() + self.length))
                    for match in re.finditer(rprimer, seqs[seq_name],
                                             re.IGNORECASE):
                        hits.append((seq_name, -1, match.end() - self.length,
                                     match.end()))
                    if len(hits) > 100:
                        raise config.Error("Many many hits for " + id + ".")

                if not hits:
                    errors.append("No hits for " + id + ".")
                    continue

                if len(hits) > 1:
                    self.log.log("Warning: %d hits for %s.\n" %
                                 (len(hits), id))

                for i, hit in enumerate(hits):
                    hit_name = id
                    if len(hits) > 1:
                        hit_name += "-%dof%d" % (i + 1, len(hits))
                    result.append(
                        annotation.Annotation(seqid=hit[0],
                                              source="tail-tools",
                                              type="region",
                                              start=hit[2],
                                              end=hit[3],
                                              strand=hit[1],
                                              attr=dict(ID=hit_name,
                                                        Primer=primer)))

        if errors:
            raise config.Error("\n".join(errors))

        annotation.write_gff3(self.prefix + ".gff", result)
Example #14
    def run(self):
        seqs = env.load_ref(self.reference).seqs
        
        result = [ ]
        errors = [ ]
        with open(self.csv_file, "rU") as f:
            reader = csv.reader(f)
            headings = reader.next()
            headings = [ item.lower() for item in headings ]
            assert "id" in headings
            assert "primer" in headings
            id_col = headings.index("id")
            primer_col = headings.index("primer")
            
            for row in reader:
                if len(row) == 0 or (not row[id_col].strip() and not row[primer_col].strip()):
                    continue
                
                id = row[id_col].strip()
                assert " " not in id, "ID contains space: "+id
                primer = row[primer_col].strip().upper()
                assert len(primer) > self.skip, "Primer too short: "+id
                assert all( char in "ACGT" for char in primer ), "Primer not ACGT: "+id
                primer = primer[self.skip:]
                rprimer = bio.reverse_complement(primer)
                
                hits = [ ]
                for seq_name in seqs:
                    for match in re.finditer( 
                            primer, seqs[seq_name], re.IGNORECASE):
                        hits.append( (seq_name, 1, match.start(), match.start()+self.length) )
                    for match in re.finditer(
                            rprimer, seqs[seq_name], re.IGNORECASE):
                        hits.append( (seq_name, -1, match.end()-self.length, match.end()) )
                    if len(hits) > 100:
                        raise config.Error("Many many hits for "+id+".")
                
                if not hits:
                    errors.append("No hits for "+id+".")
                    continue

                if len(hits) > 1:
                    self.log.log("Warning: %d hits for %s.\n" % (len(hits),id))
                    
                for i, hit in enumerate(hits):
                    hit_name = id
                    if len(hits) > 1:
                        hit_name += "-%dof%d" % (i+1,len(hits))
                    result.append(annotation.Annotation(
                        seqid = hit[0],
                        source = "tail-tools",
                        type = "region",
                        start = hit[2],
                        end = hit[3],
                        strand = hit[1],
                        attr = dict(
                            ID=hit_name,
                            Primer=primer
                            )
                        ))
        
        if errors:
            raise config.Error("\n".join(errors))
        
        annotation.write_gff3(self.prefix+".gff", result)
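Example #13 (and #14, its unformatted original) finds primer sites on both strands: forward hits by matching the primer directly, reverse-strand hits by matching its reverse complement and counting back from match.end(). A compact sketch of the double search with a toy reference, again using the stand-in reverse_complement from Example #1:

    import re

    seqs = {'chr1': 'TTACGTTTTTAACGTAAA'}   # toy reference
    primer = 'AACG'
    length = 6
    rprimer = reverse_complement(primer)    # 'CGTT'

    hits = []
    for seq_name in seqs:
        for match in re.finditer(primer, seqs[seq_name], re.IGNORECASE):
            hits.append((seq_name, 1, match.start(), match.start() + length))
        for match in re.finditer(rprimer, seqs[seq_name], re.IGNORECASE):
            hits.append((seq_name, -1, match.end() - length, match.end()))

    assert hits == [('chr1', 1, 10, 16), ('chr1', -1, 1, 7)]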
Example #15
    def run(self):
        workspace = self.get_workspace()
        
        read_length = 100
        left = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank
        
        right = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank+right
        
        i = 0
        
        variants_used = [ ]
        
        with open(workspace/'reads.fq','wb') as f:
            for i, variant in enumerate(self.variants):
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append( (variant,count) )
                seq = left+variant+right
                for j in xrange(count):
                    pos = len(variant)+random.randrange(read_length-len(variant))
                    read = seq[pos:pos+read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    i += 1
                    io.write_fastq(f,'read_%s_%d' % (variant,i),read,chr(64+30)*len(read))

        reference = left+self.ref+right
        primary_variant = left+variants_used[0][0]+right

        with open(workspace/'reference.fa','wb') as f:
            io.write_fasta(f,'chr1',reference)
        
        legion.remake_needed()
        
        self.analysis(
            workspace/'sample',
            workspace/'reference.fa',
            reads = [ workspace/'reads.fq' ],
            ).run()
        
        self.freebayes(
            workspace/'freebayes',
            workspace/'sample',
            ).run()
        
        self.vcf_filter(
            workspace/'filtered',
            workspace/'freebayes.vcf',
            ).run()
        
        Vcf_patch(
            workspace/'patch',
            workspace/('sample','reference'),
            workspace/'filtered.vcf'
            ).run()
        
        patched = io.read_sequences(workspace/('patch','sample.fa')).next()[1]
        
        masked = io.read_sequences(workspace/('sample','consensus_masked.fa')).next()[1].upper()
        
        with open(workspace/'freebayes.vcf','rU') as f:
            reader = vcf.Reader(f)
            raw_count = len(list(reader))
        
        with open(workspace/'filtered.vcf','rU') as f:
            reader = vcf.Reader(f)
            filtered_count = len(list(reader))
        
        with open(workspace/('sample','report.txt'),'rb') as f:
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name,'changes found by "nesoni consensus:"', nesoni_count)
        self.log.datum(workspace.name,'is correctly patched by "nesoni consensus:"', masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name,'raw variants', raw_count)
        self.log.datum(workspace.name,'variants after filtering', filtered_count)
        self.log.datum(workspace.name,'is correctly patched by VCF pipeline', patched == primary_variant)
        self.log.log('\n')
Example #16
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
    
    if len(args) < 2:
        print USAGE
        return 1
    
    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]
    
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
        
        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )
        
        if not circular: scaffold = scaffold[:-1]
        
        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )
            
    grace.execute(args, [scaffold])
    
    custom_scaffolds = (len(scaffolds) != 0)    
    
    sequences = dict( 
        (a.split()[0], b.upper()) 
          for a,b in 
            io.read_sequences(os.path.join(
              graph_dir, '454AllContigs.fna')))
    
    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
    
    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'
    
    links = collections.defaultdict(list)
    
    for line in open(
      os.path.join(graph_dir, '454ContigGraph.txt'),
      'rU'):
        parts = line.rstrip('\n').split('\t')
        
        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])
        
        if parts[0] == 'C':    
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            
            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
    
        if parts[0] == 'S' and not custom_scaffolds:  
            name = 'scaffold%05d' % int(parts[2])  
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap',int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
            scaffolds.append( (name, scaffold) )
    
    
    
    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #    
    #    paths[(source,dest)] = path
    #    
    #    if len(contexts[dest]) > max_filler_length: continue
    #    
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )
    
    
    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next
    
    
    # Use links, in order to depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property
    
    todo = [ ]
    for i in contexts:    
        for depth_link, right in links[i]:
            todo.append( ( depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]: continue
        
        sources = [(left,right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()
        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()
        
        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next
    
    
    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa','wb')
    
    #comments = [ ]
    features = [ ]
    
    used = set()
    previous_total = 0
    
    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'
                
                gap_start = len(result)
    
                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        
                        k = path_source_dest[k][right[1]]
                    
                    n_filled += 1
                        
                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1)
                else:
                    n_failed += 1
                    
                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])
    
                gap_end = len(result)
                
                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))
                    
    
        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region    %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)
    
    scaffold_f.close()
    
    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version    3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()
    
    
    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()
    
    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap': continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end  ' % name ] = -scaffold[0][1] 
    
    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
Example #17
    def run(self):
        workspace = self.get_workspace()

        read_length = 100
        left = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank

        right = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank + right

        i = 0

        variants_used = []

        with open(workspace / 'reads.fq', 'wb') as f:
            for i, variant in enumerate(self.variants):
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append((variant, count))
                seq = left + variant + right
                for j in xrange(count):
                    pos = len(variant) + random.randrange(read_length -
                                                          len(variant))
                    read = seq[pos:pos + read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    i += 1
                    io.write_fastq(f, 'read_%s_%d' % (variant, i), read,
                                   chr(64 + 30) * len(read))

        reference = left + self.ref + right
        primary_variant = left + variants_used[0][0] + right

        with open(workspace / 'reference.fa', 'wb') as f:
            io.write_fasta(f, 'chr1', reference)

        legion.remake_needed()

        self.analysis(
            workspace / 'sample',
            workspace / 'reference.fa',
            reads=[workspace / 'reads.fq'],
        ).run()

        self.freebayes(
            workspace / 'freebayes',
            workspace / 'sample',
        ).run()

        self.vcf_filter(
            workspace / 'filtered',
            workspace / 'freebayes.vcf',
        ).run()

        Vcf_patch(workspace / 'patch', workspace / ('sample', 'reference'),
                  workspace / 'filtered.vcf').run()

        patched = io.read_sequences(workspace /
                                    ('patch', 'sample.fa')).next()[1]

        masked = io.read_sequences(
            workspace / ('sample', 'consensus_masked.fa')).next()[1].upper()

        with open(workspace / 'freebayes.vcf', 'rU') as f:
            reader = vcf.Reader(f)
            raw_count = len(list(reader))

        with open(workspace / 'filtered.vcf', 'rU') as f:
            reader = vcf.Reader(f)
            filtered_count = len(list(reader))

        with open(workspace / ('sample', 'report.txt'), 'rb') as f:
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name, 'changes found by "nesoni consensus:"',
                       nesoni_count)
        self.log.datum(workspace.name,
                       'is correctly patched by "nesoni consensus:"',
                       masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name, 'raw variants', raw_count)
        self.log.datum(workspace.name, 'variants after filtering',
                       filtered_count)
        self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                       patched == primary_variant)
        self.log.log('\n')
Example #18
    def run(self):
        log = self.log
        
        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)
        
        prefix = self.prefix
        log_name = os.path.split(prefix)[1]
        
        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects
    
        iterators = [ ]        
        filenames = [ ]
        any_paired = False
        
        for filename in self.reads:
            filenames.append(filename)
            iterators.append(itertools.izip(
                 io.read_sequences(filename, qualities=True)
            ))
        
        for pair_filenames in self.pairs:
            assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(itertools.izip(
                io.read_sequences(pair_filenames[0], qualities=True),
                io.read_sequences(pair_filenames[1], qualities=True)
            ))
        
        for filename in self.interleaved:
            filenames.append(filename)
            any_paired = True
            iterators.append(deinterleave(
                io.read_sequences(filename, qualities=True)
            ))
        
        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]
        
        assert iterators, 'Nothing to clip'
        
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    
        if qoffset is None:
            guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)    
    
        quality_cutoff_char = chr(qoffset + quality_cutoff)
        
        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)
        
        adaptor_seqs = [ ]
        adaptor_names = [ ]
        if self.adaptor_clip:
            if self.adaptor_file:
                adaptor_iter = io.read_sequences(self.adaptor_file)
            else:
                adaptor_iter = ADAPTORS
            for name, seq in adaptor_iter:
                seq = seq.upper().replace('U','T')
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)
        
        start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
        end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    
        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq
    
        f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
        if fragment_reads == 2:
            names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
            f_paired = map(io.open_possibly_compressed_writer, names)
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])
        
        n_single = 0
        n_paired = 0
        
        n_in_single = 0
        n_in_paired = 0
        total_in_length = [ 0 ] * fragment_reads
        
        n_out = [ 0 ] * fragment_reads
        n_q_clipped = [ 0 ] * fragment_reads
        n_a_clipped = [ 0 ] * fragment_reads
        n_homopolymers = [ 0 ] * fragment_reads
        total_out_length = [ 0 ] * fragment_reads
        
        #log.attach(open(prefix + '_log.txt', 'wb'))
        
        for iterator in iterators:
          for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))
        
            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1
            
            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)
                
                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]
                
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start
        
                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]
                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append( (name,seq,qual,'quality') ) 
                    continue
        
                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
            
                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[: len(clipped_seq)-match[0] ]    
                    clipped_qual = clipped_qual[: len(clipped_qual)-match[0] ]    
                    end_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
    
                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append( (name,seq,qual,'homopolymer') ) 
                    continue
        
                graduates.append( (name, clipped_seq, clipped_qual) )
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)
    
            if output_rejects:
                for name,seq,qual,reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)
             
            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]
            
                if len(graduates) == 1:
                    n_single += 1

                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1

                    # Write the pair to an interleaved file or separate l/r files
                    for (lr,(name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr%len(f_paired)], name, seq, qual)
                
        
        grace.status('')
        
        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            map(lambda f: f.close(), f_paired)
        f_single.close()
        
        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location, total) 
            
            if not clips:
                return
    
            for i in xrange(min(clips), max(clips)+1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum( item2[0] for item2 in item )) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)
                    
                    counts = collections.defaultdict(int)
                    for item2 in item: counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,key=lambda item2:counts[item2],reverse=True)[:2]:
                        log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')
                    
                log.quietly_log('\n')
            log.quietly_log('\n')


        if n_in_paired:
            log.datum(log_name,'read-pairs', n_in_paired)                      
        if n_in_single:
            log.datum(log_name,'single reads', n_in_single)                      
        
        for i in xrange(fragment_reads):
            prefix = read_in_fragment_names[i]
        
            if start_clips:
                summarize_clips(prefix, 'start', start_clips[i])
        
            if end_clips:
                summarize_clips(prefix, 'end', end_clips[i])
        
            log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',  float(total_in_length[i]) / (n_in_single+n_in_paired))                     
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])                     
        
        if fragment_reads == 2:
            log.datum(log_name,'pairs kept after clipping', n_paired)                      
        log.datum(log_name, 'reads kept after clipping', n_single)
    def run(self):
        adaptor = self.adaptor.upper()
        name = self.name or os.path.basename(self.prefix)
        
        headers = sam.bam_headers(self.input)
        
        writer = sam.Bam_writer(self.prefix+"_temp.bam", headers)
        
        n_kept = 0
        n_unaligned = 0
        n_discarded = 0
        n_multi = 0

        for i, al in enumerate(sam.Bam_reader(self.input)):
            if al.flag & sam.FLAG_UNMAPPED:
                writer.write(al)
                n_unaligned += 1
                continue
            
            reverse = al.flag & sam.FLAG_REVERSE
            if reverse:
                read_bases = bio.reverse_complement(al.seq.upper())
                cigar = cigar_parts(al.cigar)[::-1]
            else:
                read_bases = al.seq.upper()
                cigar = cigar_parts(al.cigar)
            
            # Number of soft-clipped bases at the end of the read.
            n_soft_clipped = 0
            if cigar and cigar[-1][1] == "S":
                n_soft_clipped = cigar[-1][0]
            
            n_aligned = len(read_bases) - n_soft_clipped
            seq_unaligned = read_bases[n_aligned:]
            seq_aligned = read_bases[:n_aligned]
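            # a_adaptor_count and a_count are not shown in this snippet.
            # From their use here, a_adaptor_count appears to return the
            # poly(A) length (AN) and the following adaptor bases (AD) in the
            # unaligned tail, and a_count the run of A's at the start of its
            # argument (here the aligned part reversed, i.e. genomic A's, AG).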
            AN, AD = a_adaptor_count(seq_unaligned, adaptor)
            AG = a_count(seq_aligned[::-1])
            
            if AN: al.extra.append("AN:i:%d" % AN)
            if AD: al.extra.append("AD:i:%d" % AD)
            if AG: al.extra.append("AG:i:%d" % AG)
            if AN >= 4: al.extra.append("AA:i:1")
            
            if n_aligned - AG < self.min_genomic:
                al.flag = al.flag | sam.FLAG_UNMAPPED
                n_discarded += 1
            else:
                n_kept += 1

                NH = 1
                for item in al.extra:
                    if item.startswith("NH:i:"):
                        NH = int(item[5:])
                if NH > 1: n_multi += 1
            
            if i % 10000 == 0:
                print al.rname, al.pos
                print cigar
                print " "*(len(seq_aligned)-AG)+"="*AG
                print seq_aligned
                print seq_unaligned
                print "="*AN+"D"*AD
            
            writer.write(al)

        self.log.datum(name, "reads", n_unaligned+n_kept+n_discarded)
        self.log.datum(name, "did not align", n_unaligned)
        self.log.datum(name, "short non-A alignments discarded", n_discarded)
        self.log.datum(name, "alignments kept", n_kept)
        self.log.datum(name, "multimappers", n_multi)

        writer.close()        
        sam.sort_and_index_bam(
            self.prefix+"_temp.bam",
            self.prefix)
        os.unlink(self.prefix+"_temp.bam")
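The snippet above relies on cigar_parts, a_count and a_adaptor_count helpers that are not shown. A minimal sketch of the first two, assuming (length, operation) CIGAR tuples as indexed by cigar[-1][0] and cigar[-1][1] above (the regular expression and the poly(A)-counting behaviour are assumptions, not the confirmed implementations):

import re

def cigar_parts(cigar):
    # Hypothetical: split a CIGAR string such as "5S70M" into
    # (length, operation) tuples, e.g. [(5, "S"), (70, "M")].
    return [(int(n), op) for n, op in re.findall(r"(\d+)([MIDNSHP=X])", cigar)]

def a_count(seq):
    # Hypothetical: length of the leading run of "A" in seq. The example
    # passes the aligned bases reversed, so this measures a trailing
    # genomic poly(A) run (AG).
    n = 0
    for c in seq:
        if c != "A":
            break
        n += 1
    return n
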
Example #20
0
    def run(self):
        log = self.log

        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)

        prefix = self.prefix
        log_name = os.path.split(prefix)[1]

        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        adaptor_set = self.adaptors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects

        iterators = []
        filenames = []
        any_paired = False

        for filename in self.reads:
            filenames.append(filename)
            iterators.append(
                itertools.izip(io.read_sequences(filename, qualities=True)))
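        # Note: izip over a single iterator yields 1-tuples, so unpaired
        # reads become fragments of length one, matching the paired case.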

        for pair_filenames in self.pairs:
            assert len(pair_filenames) == 2, \
                'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(
                itertools.izip(
                    io.read_sequences(pair_filenames[0], qualities=True),
                    io.read_sequences(pair_filenames[1], qualities=True)))

        for filename in self.interleaved:
            filenames.append(filename)
            any_paired = True
            iterators.append(
                deinterleave(io.read_sequences(filename, qualities=True)))

        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = ['read-1', 'read-2'] if any_paired else ['read']

        assert iterators, 'Nothing to clip'

        if qoffset is None:
            guesses = [
                io.guess_quality_offset(filename) for filename in filenames
            ]
            assert len(set(guesses)) == 1, \
                'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)

        quality_cutoff_char = chr(qoffset + quality_cutoff)
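        # Comparing raw quality characters against this single character is
        # equivalent to comparing numeric scores, since FASTQ stores quality
        # as chr(qoffset + quality).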

        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)

        adaptor_seqs = []
        adaptor_names = []
        if adaptor_set and adaptor_set.lower() != 'none':
            for item in adaptor_set.split(','):
                item = item.strip().lower() + ' '
                any = False
                for line in ADAPTORS.strip().split('\n'):
                    if line.startswith('#'): continue
                    if not line.lower().startswith(item): continue
                    any = True
                    name, seq = line.rsplit(None, 1)
                    seq = seq.replace('U', 'T')

                    #if seq in adaptor_seqs: print 'Dup', name
                    adaptor_seqs.append(seq)
                    adaptor_names.append(name)
                    adaptor_seqs.append(bio.reverse_complement(seq))
                    adaptor_names.append(name)
                if not any:
                    raise grace.Error('Unknown adaptor set: ' + item)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

        start_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]
        end_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]

        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq

        f_single = io.open_possibly_compressed_writer(
            self.reads_output_filenames()[0])
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        n_single = 0
        n_paired = 0

        n_in_single = 0
        n_in_paired = 0
        total_in_length = [0] * fragment_reads

        n_out = [0] * fragment_reads
        n_q_clipped = [0] * fragment_reads
        n_a_clipped = [0] * fragment_reads
        n_homopolymers = [0] * fragment_reads
        total_out_length = [0] * fragment_reads

        #log.attach(open(prefix + '_log.txt', 'wb'))

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start
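                    # The scan above finds the longest run of consecutive
                    # bases that pass the quality cutoff (and are unambiguous
                    # when clip_ambiguous is set): `start` tracks where the
                    # current run began, best_start/best_len the longest so far.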

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) -
                                                    match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq,
                                       qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq),
                                      qual[::-1])
                                     for name, seq, qual in graduates]

                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')

        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            f_paired.close()
        f_single.close()

        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location,
                      total)

            if not clips:
                return

            for i in xrange(min(clips), max(clips) + 1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum(item2[0]
                                           for item2 in item)) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)

                    counts = collections.defaultdict(int)
                    for item2 in item:
                        counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,
                                     key=lambda item2: counts[item2],
                                     reverse=True)[:2]:
                        log.quietly_log('%dx%s ' %
                                        (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')

                log.quietly_log('\n')
            log.quietly_log('\n')

        if n_in_paired:
            log.datum(log_name, 'read-pairs', n_in_paired)
        if n_in_single:
            log.datum(log_name, 'single reads', n_in_single)

        for i in xrange(fragment_reads):
            prefix = read_in_fragment_names[i]

            if start_clips:
                summarize_clips(prefix, 'start', start_clips[i])

            if end_clips:
                summarize_clips(prefix, 'end', end_clips[i])

            log.datum(log_name, prefix + ' too short after quality clip',
                      n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip',
                      n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers',
                          n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',
                      float(total_in_length[i]) / (n_in_single + n_in_paired))
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length',
                          float(total_out_length[i]) / n_out[i])

        if fragment_reads == 2:
            log.datum(log_name, 'pairs kept after clipping', n_paired)
        log.datum(log_name, 'reads kept after clipping', n_single)
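Example #20 also calls a deinterleave helper that is not shown. A minimal sketch, assuming it simply pairs up consecutive records of an interleaved file (the name and behaviour are inferred from its use above, not confirmed from the library):

def deinterleave(iterator):
    # Hypothetical helper: pair consecutive (name, seq, qual) records from
    # an interleaved stream into two-read fragments.
    for read_1 in iterator:
        read_2 = iterator.next()  # Python 2 iterator protocol, as used elsewhere here
        yield (read_1, read_2)
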
Example #21
0
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)
        
    output_dir, args = args[0], args[1:]
    
    #, ref_filename, contig_filenames = args[0], args[1], args[2:]
    
    ref_filenames = [ ]
    contig_filenames = [ ]
    grace.execute(args, {
        'contigs' : lambda args: contig_filenames.extend(args)
    }, lambda args: ref_filenames.extend(args))
    
    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'
    
    contigs = dict([ 
                 (name.split()[0], seq) 
                 for filename in contig_filenames 
                 for name, seq in io.read_sequences(filename) 
              ])
    dir_contigs = { }
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])
    
    dir_contigs_used = { }
    for name in dir_contigs:
        dir_contigs_used[name] = [ False ] * len(dir_contigs[name])


    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')
    
    out_f = workspace.open('pastiche.fa', 'wb')
    
    for ref_filename in ref_filenames:
      for ref_name, ref_seq in io.read_sequences(ref_filename):
        ref_name = ref_name.split()[0]
        
        grace.status(ref_name)
        
        f = open(temp_prefix + '.fa','wb')
        io.write_fasta(f, 'ref', ref_seq)
        f.close()
    
        scores = [ -1 ] * (len(ref_seq)*2)
        strings = [ 'N', '' ] * (len(ref_seq))
        contexts = [ None for i in xrange(len(ref_seq)*2) ]
        
        #MAXSCORE = len(ref_seq)+1
        #for i in xrange(len(ref_seq)):
        #    if ref_seq[i].upper() != 'N':
        #        strings[i*2] = ref_seq[i]
        #        scores[i*2] = MAXSCORE
        #for i in xrange(len(ref_seq)-1):
        #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
        #        scores[i*2+1] = MAXSCORE

        if mask_only:        
            for i in xrange(len(ref_seq)):
                strings[i*2] = ref_seq[i].lower()
        
        
        def put(position, dir_contig_name, start, end, score):
            if scores[position] < score:
                scores[position] = score
                strings[position] = dir_contigs[dir_contig_name][start:end]
                contexts[position] = (dir_contig_name, start, end, score)
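        # scores/strings/contexts hold two slots per reference position:
        # index 2*i is the base replacing position i, and 2*i+1 any
        # insertion between positions i and i+1; put() is called with both
        # kinds of index in the alignment walk below.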

        for contig_filename in contig_filenames:
            execute(['nucmer',
                     '--prefix', temp_prefix,
                     #'--maxmatch', #Very slow
                     '--nosimplify',
                     '--minmatch', '9',
                     '--mincluster', '50',
                     #'--maxgap', '1000',
                     #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                     #'--diagfactor', '1.0',
                     temp_prefix+'.fa',
                     contig_filename])
            
            for contig_name, contig_seq in io.read_sequences(contig_filename):
                contig_name = contig_name.split()[0]
                grace.status(ref_name + ' vs ' + contig_name)
                p = run(['show-aligns', temp_prefix+'.delta', 'ref', contig_name],
                        stderr=subprocess.PIPE)
                
                alignments = [ ]
                
                while True:
                    line = p.stdout.readline()
                    if not line: break
                    if not line.startswith('-- BEGIN'):
                        continue
                    
                    parts = line.split()
                    
                    ref_start = int(parts[5])
                    ref_end = int(parts[7])
                    query_start = int(parts[10])
                    query_end = int(parts[12])
                    
                    #assert ref_start < ref_end
                    #ref_start -= 1 #Zero based coordinates
                    
                    al_ref = [ ]
                    al_query = [ ]
                    
                    while True:
                        block = [ ]
                        end = False
                        while True:
                            line = p.stdout.readline()
                            if line.startswith('--   END'): 
                                end = True
                                break
                            if line == '\n':
                                if block: 
                                    break
                                else:
                                    continue
                            block.append(line)
                        
                        if end: break
                        
                        al_ref.append(block[0].split()[1])
                        al_query.append(block[1].split()[1])
                        
                    al_ref = ''.join(al_ref)
                    al_query = ''.join(al_query)            
                    
                    if ref_start > ref_end:
                        al_ref = bio.reverse_complement(al_ref)
                        al_query = bio.reverse_complement(al_query)
                        ref_start, ref_end = ref_end, ref_start
                        query_start, query_end = query_end, query_start
                    
                    if query_start > query_end:
                        dir_contig_name = contig_name + '-'
                        query_start = len(contig_seq)+1-query_start
                        query_end = len(contig_seq)+1-query_end
                    else:
                        dir_contig_name = contig_name + '+'
                    
                    ref_start -= 1 #Zero based coordinates
                    query_start -= 1
                    
                    #print al_ref
                    #print al_query
                    
                    #Pretty dumb scoring scheme
                    al_score = 0
                    for i in xrange(len(al_ref)):
                        if al_ref[i] == al_query[i]:
                            al_score += 1
                        #else:
                        #    al_score -= 1
                    
                    #Pastiche alignment over reference
                    ref_pos = ref_start
                    query_pos = query_start
                    al_pos = 0
                    while al_pos < len(al_ref):
                        assert al_ref[al_pos] != '.'                
                        if al_query[al_pos] == '.':
                            put(ref_pos*2, dir_contig_name, query_pos, query_pos, al_score)
                        else:
                            assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower()
                            put(ref_pos*2, dir_contig_name, query_pos, query_pos+1, al_score)
                            query_pos += 1
                        al_pos += 1
                        
                        al_pos_end = al_pos
                        query_pos_end = query_pos
                        while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                            al_pos_end += 1
                            query_pos_end += 1
                        #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                        assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower() 
                        put(ref_pos*2+1, dir_contig_name, query_pos,query_pos_end, al_score)
                        al_pos = al_pos_end
                        query_pos = query_pos_end
                        ref_pos += 1
                    
                    
                p.wait()
            
        grace.status(ref_name)
        
        result = ''.join(strings)    
        io.write_fasta(out_f, ref_name, result)
        
        
        for context in contexts:
            if context is None: continue
            name,start,end,score = context
            for i in xrange(start,end):
                dir_contigs_used[name][i] = True
        
        
        #Interpolation
        #result = [ ]
        #i = 0
        #while i < len(ref_seq):
        #    if strings[i*2].upper() != 'N':
        #        result.append(strings[i*2])
        #        result.append(strings[i*2+1])
        #        i += 1
        #        continue
        #    
        #    j = i
        #    while strings[j*2].upper() == 'N':
        #        j += 1
        #    
        #    grace.status('')
        #    print >> sys.stderr, 'interpolating', i+1,'..',j
        #    
        #    window = 20 #!!!!!!!!!!!
        #    left_contexts = collections.defaultdict(lambda:0)
        #    for i1 in xrange(max(0,i-window),i):
        #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
        #            key = (context_name, context_end + i - i1)
        #            left_contexts[key] = max(left_contexts[key],context_score)
        #        
        #    right_contexts = collections.defaultdict(lambda:0)
        #    for j1 in xrange(j,min(j+window,len(ref_seq))):
        #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
        #            key = (context_name, context_start + j - j1)
        #            right_contexts[key] = max(left_contexts[key],context_score)
        #    
        #    #print >> sys.stderr, left_contexts
        #    #print >> sys.stderr, right_contexts
        #    
        #    options = [ ]
        #    
        #    for (left_name, left_pos), left_score in left_contexts.items():
        #        for (right_name, right_pos), right_score in right_contexts.items():
        #            if left_name != right_name: continue
        #            if right_pos < left_pos: continue
        #            
        #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
        #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
        #            
        #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)                  
        #            score *= left_score + right_score
        #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
        #            options.append( (score, left_name, left_pos, right_pos) )
        #    
        #    if options:
        #        best = max(options, key=lambda option: option[0])
        #        print >> sys.stderr, '->', best
        #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
        #    else:
        #        print >> sys.stderr, '-> no good interpolation'
        #        result.append( ref_seq[i:j] )
        #    
        #    i = j
        #
        #result = ''.join(result)    
        #io.write_fasta(sys.stdout, ref_name, result)
        
        
        #print >> sys.stderr, len(result), result.count('N')
        #for pos, size in N_runs:
        #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
        #    print >> sys.stderr, pos, size, '->', out_size        
    
    out_f.close()
    
    grace.status('')
    
    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)
    
    
    leftover_f = workspace.open('leftovers.fa','wb')

    for name in sorted(contigs):
        used = [ (a or b) for a,b in zip(dir_contigs_used[name+'+'],dir_contigs_used[name+'-'][::-1]) ]
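        # The '-' usage array is indexed along the reverse complement, so it
        # is reversed here to line up with forward-strand coordinates before
        # combining the two orientations.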

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]: 
                j += 1
            if j-i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i+1,j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])
            
            i = j+1        

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)