Exemplo n.º 1
0
def _run_tool(argv):
    '''Run an external program from an argument list, without a shell.

    Passing a list keeps paths containing spaces or shell metacharacters
    intact. Like os.system, this never raises when the program is missing:
    the problem is reported on stderr and 127 is returned instead.
    '''
    import subprocess
    try:
        return subprocess.call(argv)
    except OSError as err:
        sys.stderr.write('could not run %s: %s\n' % (argv[0], err))
        return 127


def _prefix_marked_lines(path, marker, prefix):
    '''Rewrite path in place, inserting prefix right after marker on every
    line that starts with marker; all other lines are written unchanged.'''
    with open(path) as fh:
        lines = fh.readlines()
    with open(path, 'w') as fh:
        for line in lines:
            if line.startswith(marker):
                line = marker + prefix + line[len(marker):]
            fh.write(line)


def run_minimus(fasta,
                outroot=None,
                restore_singletons=True,
                contig_prefix='',
                qual=None):
    '''Assemble the reads in a fasta file with minimus (AMOS package).

    Runs ``toAmos`` to build <outroot>.afg and then ``minimus`` with
    <outroot>.minimus as its output name. The exit status of both tools is
    ignored; a failed run surfaces later when the expected output files
    cannot be read.

    Arguments:
    fasta -- path of the reads to assemble
    outroot -- output root; defaults to fasta with its last extension removed
    restore_singletons -- if true, write <outroot>.all.fasta holding the
        assembled contigs plus every read that is not listed in
        <outroot>.minimus.contig
    contig_prefix -- string prepended to contig names: on the '##' lines of
        <outroot>.minimus.contig, and either on the contig names written to
        <outroot>.all.fasta (restore_singletons true) or on the '>' lines of
        <outroot>.minimus.fasta (restore_singletons false)
    qual -- optional quality file handed to toAmos with -q

    Returns the path of <outroot>.all.fasta when restore_singletons is true,
    otherwise the path of <outroot>.minimus.fasta.
    '''

    if outroot is None:
        outroot = fasta.rsplit('.', 1)[0]

    to_amos = ['toAmos', '-s', '%s' % fasta]
    if qual:
        sys.stderr.write('qualities invoked (%s)\n' % qual)
        to_amos += ['-q', '%s' % qual]
    to_amos += ['-o', '%s.afg' % outroot]
    _run_tool(to_amos)
    _run_tool(['minimus', '-D', 'TGT=%s.afg' % outroot,
               '%s.minimus' % outroot])

    contig_path = outroot + '.minimus.contig'
    assembly_path = outroot + '.minimus.fasta'

    if contig_prefix:
        _prefix_marked_lines(contig_path, '##', contig_prefix)

    if restore_singletons:
        # read names are the text between '#' and '(' in the contig file
        with open(contig_path) as fh:
            in_assem = re.findall(r'#(.+?)\(', fh.read())
        reads = Seq.Fasta(fasta)
        for name in in_assem:
            del reads[name]
        all_fasta = outroot + '.all.fasta'
        assem = Seq.Fasta(assembly_path)
        allseq = Seq.Fasta()
        allseq.update(dict((contig_prefix + k, v) for k, v in assem.items()))
        allseq.update(reads)
        allseq.write_to_file(all_fasta)
        return all_fasta

    if contig_prefix:
        _prefix_marked_lines(assembly_path, '>', contig_prefix)
    return assembly_path
Exemplo n.º 2
0
def build_fasta_from_scaff_gff(infasta_s,
                               gff,
                               contig_prefix='',
                               include_singletons=True,
                               ol_minID=0.9,
                               outfile=None,
                               mum_len='4'):
    '''Build a single assembly from scaffolding records.

    gff holds scaffolding records of the form generated by
    get_scaff_from_minimus; only records whose seqid is present in the input
    sequences, and whose contig joins two or more sequences, are used.

    Arguments:
    infasta_s -- a fasta path (loaded with Seq.Fasta) or an already loaded
        sequence collection, which is deep-copied and left unmodified
    gff -- iterable of records with the keys 'seqid', 'attribute_contig',
        'attribute_cstart' and 'attribute_cend'
    contig_prefix -- string prepended to every scaffold name
    include_singletons -- if true, input sequences that take part in no
        scaffold are added to the returned assembly as they are
    ol_minID -- minimum identity, as a fraction, accepted for an overlap
        between consecutive fragments of a contig
    outfile -- if given, the assembly is also written to this path
    mum_len -- minimum match length handed to Aln.mum as its '-l' argument

    An overlap between two consecutive fragments is accepted when any of the
    following holds: it is too short to be assessed with mum_len, the summed
    match score divided by the shorter overlap length reaches ol_minID, the
    two overlap sequences are identical over the shorter length, or
    Seq.is_simple is true for either one. Otherwise a new piece named
    <contig>b, <contig>c, ... is started.

    Returns the assembly (a Seq.Fasta).
    '''

    if isinstance(infasta_s, str):
        infasta = Seq.Fasta(infasta_s)
    else:
        infasta = deepcopy(infasta_s)

    # '' names the first piece of a contig; every unresolved overlap moves on
    # to the next letter. Running past 'z' still raises IndexError.
    suffixes = [''] + list('bcdefghijklmnopqrstuvwxyz')
    current_suffix = ''

    # use only scaffolding info relevant to the specified infasta
    in_ids = infasta.seq_names()
    this_gff = [r for r in gff if r['seqid'] in in_ids]

    # use only scaffolding info that joins 2 or more seqs
    contigs = {}
    for r in this_gff:
        contigs[r['attribute_contig']] = contigs.get(r['attribute_contig'],
                                                     0) + 1

    # get final ordered scaffolding layout
    this_gff = sorted(
        [r for r in this_gff if contigs[r['attribute_contig']] > 1],
        key=lambda r: (r['attribute_contig'], int(r['attribute_cstart']),
                       int(r['attribute_cend'])))

    # extract sequences and orient for scaffolding
    assem_frags = infasta.substr_from_gff(this_gff,
                                          plus_strand=True,
                                          name_key=None)

    assem = Seq.Fasta()
    for k, v in contigs.items():
        if v > 1:
            assem[contig_prefix + k] = Seq.Sequence('')

    # walk consecutive pairs; the last fragment is appended after the loop
    for r, nxt in zip(this_gff, this_gff[1:]):
        frag = assem_frags[r['seqid']]
        target = contig_prefix + r['attribute_contig'] + current_suffix

        if r['attribute_contig'] != nxt['attribute_contig']:
            # r closes its contig: append it whole and reset the suffix
            assem[target] += frag
            current_suffix = ''
            continue

        s1, e1, s2, e2 = [
            int(n) for n in [
                r['attribute_cstart'], r['attribute_cend'],
                nxt['attribute_cstart'], nxt['attribute_cend']
            ]
        ]
        sys.stderr.write('scaffolding %s %s %s %s:\n\t%s\n\t%s\n' %
                         (s1, e1, s2, e2, r, nxt))

        if e1 > s2:
            fa1 = Seq.Fasta()
            fa2 = Seq.Fasta()
            # end of this fragment and start of the next one, i.e. the two
            # copies of the region both fragments claim
            fa1['seq1'] = frag[(s2 - s1):]
            fa2['seq2'] = assem_frags[nxt['seqid']][:(e1 - s2 + 1)]
            sys.stderr.write('OVERLAP:\n\t%s\n\t%s\n' %
                             (fa1['seq1'], fa2['seq2']))
            shorter = min(len(fa1['seq1']), len(fa2['seq2']))
            mums = Aln.mum(fa1,
                           fa2,
                           mumargs={'-l': '%s' % int(mum_len)})[0]
            match = float(sum([mumr['score'] for mumr in mums]))
            if (shorter <= 2*int(mum_len) + math.ceil((1-ol_minID)*shorter)) or \
              (match/shorter >= ol_minID) or \
              (fa1['seq1'][:shorter] == fa2['seq2'][:shorter]) or \
              (Seq.is_simple(fa1['seq1']) or Seq.is_simple(fa2['seq2'])):
                # Keep only the part of this fragment that precedes the
                # overlap. The cut is relative to the fragment (s2 - s1),
                # the same offset used to extract the overlap above; cutting
                # at the contig coordinate s2 would repeat the overlapping
                # bases for every fragment that does not start at 0.
                assem[target] += frag[:(s2 - s1)]
            else:
                # TODO: implement record of splitting into a/b/etc fragments
                sys.stdout.write('%s \n%s \n%s\n' % (fa1, fa2, mums))
                current_suffix = suffixes[suffixes.index(current_suffix) + 1]
                sys.stderr.write(
                    'overlap of %s bp %0.2f %%id unresolved (min %0.2f)\nstarting %s\n'
                    % (e1 - s2, match / (e1 - s2), ol_minID, current_suffix))
                assem[contig_prefix + r['attribute_contig'] +
                      current_suffix] = frag
        else:
            # no overlap: pad the gap between the fragments with n
            spacer = Seq.Sequence('n' * (s2 - e1))
            assem[target] += frag + spacer

    if this_gff:
        last = this_gff[-1]
        assem[contig_prefix + last['attribute_contig'] +
              current_suffix] += assem_frags[last['seqid']]

    if include_singletons:
        # build the lookup once instead of once per input sequence
        scaffolded = set(r['seqid'] for r in this_gff)
        singletons = dict((k, v) for k, v in infasta.items()
                          if k not in scaffolded)
        assem.update(singletons)

    if outfile:
        assem.write_to_file(outfile)

    return assem