Пример #1
0
    def runDiamond(self, ref, qry, nhits=10, frames='7') :
        """Run diamond blastp between translated reference pieces and the translated query.

        The query is translated with ``frame='F'``; for every query sequence
        only the translation that 'X' cuts into the fewest segments is written
        (ties go to the lower frame index). The reference is translated in
        ``frames``, cut into pieces by a regular expression and distributed
        round-robin over five files. diamond is run once per file and the five
        outputs are handed to ``parseDiamond`` through ``self.pool``.

        ref, qry -- inputs for readFastq, read only when self.refSeq /
                    self.qrySeq are still empty.
        nhits    -- value for diamond's ``-k`` option.
        frames   -- frame specification forwarded to transeq for the reference.

        Returns the np.vstack of the arrays loaded from the parser results.
        """
        logger('Run diamond starts')

        n_parts = 5
        refAA, qryAA, aaMatch = [ os.path.join(self.dirPath, leaf) for leaf in ('refAA', 'qryAA', 'aaMatch') ]

        if not self.qrySeq :
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq :
            self.refSeq, self.refQual = readFastq(ref)

        # Query side: one record per sequence, named "<name>:<frame number>".
        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout :
            for name, translations in sorted(qryAASeq.items()) :
                _, best, aa = min( (len(t[:-1].split('X')), idx, t) for idx, t in enumerate(translations) )
                fout.write('>{0}:{1}\n{2}\n'.format(name, best+1, aa))

        makedb_cmd = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(makedb_cmd.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()

        # Reference side: records are named "<name>:<frame number>:<offset>",
        # where offset is the position of the piece inside its translation.
        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for name, translations in sorted(refAASeq.items()) :
            for idx, aa in enumerate(translations) :
                pieces = re.findall('.{1000,}?X|.{1,1000}$', aa + 'X')
                pieces[-1] = pieces[-1][:-1]    # remove the 'X' appended for the regex
                offset = 0
                for piece in pieces :
                    if piece :
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(name, idx+1, offset, piece))
                    offset += len(piece)

        for part in xrange(n_parts) :
            part_ref = '{0}.{1}'.format(refAA, part)
            part_out = '{0}.{1}'.format(aaMatch, part)
            with open(part_ref, 'w') as fout :
                fout.writelines(toWrite[part::n_parts])
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond, refAA=part_ref, qryAA=qryAA, aaMatch=part_out, n_thread=self.n_thread, min_id=self.min_id*100., nhits=nhits, min_ratio=self.min_ratio*100.)
            Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        jobs = [ ['{0}.{1}'.format(aaMatch, part), self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio] for part in xrange(n_parts) ]
        blastab = []
        for saved in self.pool.imap_unordered(parseDiamond, jobs) :
            if saved is None :
                continue
            # each non-None result is a path to a saved array; load it, then delete the file
            blastab.append(np.load(saved, allow_pickle=True))
            os.unlink(saved)
        blastab = np.vstack(blastab)
        logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
        return blastab
Пример #2
0
    def get_allele_info(allele_file):
        """Return per-locus allele statistics for a FASTA file, cached in '<allele_file>.stat'.

        When the cache file exists its JSON content is returned as-is.
        Otherwise every allele named '<locus>_<allele_id>' is encoded as

            int(allele_id) * 1000000 + len(sequence) * 10 + pseudo

        where pseudo is 2 (length not a multiple of 3), 3 ('X' inside the
        translation), 4 (does not begin with ATG/GTG/TTG), 5 (translation does
        not end with 'X') or 6 (none of the above). The result is written to
        the cache file and returned as {locus: {allele_id: code}}.

        Both file handles are now closed deterministically; the original left
        them to the garbage collector, so the cache could stay unflushed.
        """
        stat_file = allele_file + '.stat'
        if os.path.isfile(stat_file):
            with open(stat_file) as fin:
                return json.load(fin)

        alleles = readFasta(allele_file)
        allele_aa = transeq(alleles)
        allele_stat = {}
        for n, s in alleles.items():
            locus, allele_id = n.rsplit('_', 1)
            locus_stat = allele_stat.setdefault(locus, {})

            if len(s) % 3 > 0:
                pseudo = 2  # frameshift
            else:
                # translations are looked up as '<name>_1' (presumably frame 1 -
                # confirm against transeq); 'A' is the fallback when absent
                aa = allele_aa.get(n + '_1', 'A')
                if aa[:-1].find('X') >= 0:
                    pseudo = 3  # premature
                elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                    pseudo = 4  # no start
                elif aa[-1] != 'X':
                    pseudo = 5  # no stop
                else:
                    pseudo = 6  # intact
            locus_stat[allele_id] = int(allele_id) * 1000000 + len(s) * 10 + pseudo

        with open(stat_file, 'w') as fout:
            json.dump(allele_stat, fout)
        return allele_stat
Пример #3
0
 def write_refsets(self, reference):
     """Translate the reference and save the translations that have no 'X' before their last character.

     The first field of every record returned by self.readSequence is
     translated with transeq (second argument 1). The output goes to
     '<unique_key>.refset.aa' as FASTA, and that file name is returned.
     """
     ref_aa = '{0}.refset.aa'.format(parameters['unique_key'])
     nucleotides = {
         name: record[0]
         for name, record in self.readSequence(reference).iteritems()
     }
     proteins = transeq(nucleotides, 1)
     with open(ref_aa, 'w') as fout:
         for name, aa in proteins.iteritems():
             if aa[:-1].find('X') >= 0:
                 # an 'X' anywhere but the final position disqualifies the record
                 continue
             fout.write('>{0}\n{1}\n'.format(name, aa))
     return ref_aa
Пример #4
0
def MLSTdb():
    """Build '<prefix>.refset.fna' / '<prefix>.refset.faa' from allele FASTA files.

    Command-line arguments of the form key=value overwrite matching entries
    of the module-level ``parameters``; all other arguments are appended to
    ``parameters['fasta']``. Alleles are named '<locus>_<id>'.

    For every locus the first allele becomes a BLAST database and the
    remaining alleles are aligned to it. An allele is kept when its hit
    starts within the first 10 query bases at the same position in query
    and subject, ends within 10 bases of the query end, and leaves the same
    number of unaligned bases at the end of query and subject. The kept
    alleles are clustered with ``mmseq_cluster`` and the resulting file is
    appended to the .fna output. Finally the .fna is translated and every
    translation without an 'X' before its last character is written to the
    .faa output.

    Changes from the previous version: the BLAST output is collected with
    ``communicate()`` so the child is reaped and its unread stderr pipe
    cannot block it, the Popen handle is no longer overwritten by the
    per-line array, and blank output lines are skipped.

    Returns the tuple (fna path, faa path).
    """
    for arg in sys.argv[1:]:
        if arg.find('=') >= 0:
            k, v = arg.split('=', 1)
            if k in parameters:
                parameters[k] = v
        else:
            parameters['fasta'].append(arg)

    alleles = readFastaToList(parameters['fasta'])
    loci = {allele_id.rsplit('_', 1)[0]: [] for allele_id, seq in alleles}
    for allele_id, seq in alleles:
        locus, allele_no = allele_id.rsplit('_', 1)
        loci[locus].append([allele_no, seq])
    del alleles

    refset_fna = '{0}.refset.fna'.format(parameters['prefix'])
    refset_faa = '{0}.refset.faa'.format(parameters['prefix'])
    refset_file = '{0}.refset'.format(parameters['prefix'])
    alleles_file = '{0}.alleles'.format(parameters['prefix'])

    with open(refset_fna, 'w') as refout:
        for locus, alleles in loci.iteritems():
            # the per-locus work files are overwritten for every locus
            with open(refset_file, 'w') as fout:
                allele_no, seq = alleles[0]
                fout.write('>{0}\n{1}\n'.format(allele_no, seq))
            with open(alleles_file, 'w') as fout:
                for allele_no, seq in alleles[1:]:
                    fout.write('>{0}\n{1}\n'.format(allele_no, seq))

            format_cmd = '{formatdb} -dbtype nucl -in {prefix}.refset'.format(
                **parameters)
            Popen(format_cmd.split(), stderr=PIPE, stdout=PIPE).communicate()
            blast_cmd = '{blast} -db {prefix}.refset -query {prefix}.alleles -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue score qlen slen" -num_threads 6 -task blastn -evalue 1e-3 -dbsize 5000000 -reward 2 -penalty -2 -gapopen 6 -gapextend 2'.format(
                **parameters)
            # shell=True is required because the -outfmt value is quoted
            blast = Popen(blast_cmd, stderr=PIPE, stdout=PIPE, shell=True)
            blast_out, _ = blast.communicate()

            ids = {alleles[0][0]: 1}
            for line in blast_out.splitlines():
                fields = line.strip().split()
                if not fields:
                    continue
                # column order follows the -outfmt string above:
                # 6 qstart, 7 qend, 8 sstart, 9 send, 12 qlen, 13 slen
                hit = np.array(fields, dtype=float)
                if hit[6] - 1 < 10 and hit[6] == hit[8] \
                        and hit[12] - hit[7] < 10 \
                        and hit[13] - hit[9] == hit[12] - hit[7]:
                    ids[str(int(hit[0]))] = 1
            with open(alleles_file, 'w') as fout:
                for allele_no, seq in alleles:
                    if allele_no in ids:
                        fout.write('>{0}_{1}\n{2}\n'.format(
                            locus, allele_no, seq))
            outfile = mmseq_cluster(parameters['prefix'],
                                    parameters['prefix'] + '.alleles',
                                    parameters['id'])
            with open(outfile) as fin:
                for line in fin:
                    refout.write(line)

    refseq = readFastaToList(refset_fna)
    ref_aa = transeq(dict(refseq))
    with open(refset_faa, 'w') as fout:
        for n, s in ref_aa.iteritems():
            if s[:-1].find('X') < 0:
                fout.write('>{0}\n{1}\n'.format(n, s))
    return refset_fna, refset_faa
Пример #5
0
 def write_query(self, query):
     """Write the query as nucleotide and translated FASTA files.

     The first field of every record from self.readSequence goes to
     '<unique_key>.query.na'; the transeq output (frame=7) of those fields
     goes to '<unique_key>.query.aa'.

     Returns (records from readSequence, nucleotide path, amino-acid path).
     """
     key = parameters['unique_key']
     fna = '{0}.query.na'.format(key)
     faa = '{0}.query.aa'.format(key)

     qryseq = self.readSequence(query)
     nucleotides = {name: record[0] for name, record in qryseq.iteritems()}
     qryamino = transeq(nucleotides, frame=7)

     fasta_record = '>{0}\n{1}\n'
     with open(fna, 'w') as fout:
         for name, record in qryseq.iteritems():
             fout.write(fasta_record.format(name, record[0]))
     with open(faa, 'w') as fout:
         for name, aa in qryamino.iteritems():
             fout.write(fasta_record.format(name, aa))
     return qryseq, fna, faa
Пример #6
0
def checkCDS(n, s):
    """Decide whether sequence ``s`` (named ``n``) is accepted as a CDS.

    Sequences shorter than params['min_cds'] are always rejected. When
    params['incompleteCDS'] is set, every other sequence is accepted.
    Otherwise the length must be a multiple of 3 and the translation
    (transeq, frame 1, table 'starts') must begin with 'M', end with 'X'
    and contain no other 'X'. Each rejection is reported through logger.

    Returns True when accepted, False otherwise.
    """
    if len(s) < params['min_cds']:
        problem = '{0} is too short'
    elif params['incompleteCDS']:
        return True
    elif len(s) % 3 > 0:
        problem = '{0} is discarded due to frameshifts'
    else:
        aa = transeq({'n': s.upper()}, frame=1, transl_table='starts')['n'][0]
        if aa[0] != 'M':
            problem = '{0} is discarded due to lack of start codon'
        elif aa[-1] != 'X':
            problem = '{0} is discarded due to lack of stop codon'
        elif 'X' in aa[:-1]:
            problem = '{0} is discarded due to internal stop codons'
        else:
            return True

    logger(problem.format(n))
    return False
Пример #7
0
    def get_allele_info(alleles):
        """Summarise every allele as [sequence length, status code], grouped by locus.

        Allele names have the form '<locus>_<allele_id>'. Status codes:
        2 length not a multiple of 3, 3 'X' inside the translation,
        4 sequence does not begin with ATG/GTG/TTG, 5 translation does not
        end with 'X', 6 none of these.

        Returns {locus: {allele_id: [length, status]}}.
        """
        start_codons = ('ATG', 'GTG', 'TTG')
        translations = transeq(alleles)
        allele_stat = {}
        for name, seq in alleles.iteritems():
            locus, allele_id = name.rsplit('_', 1)
            per_locus = allele_stat.setdefault(locus, {})

            if len(seq) % 3 == 0:
                # translations are keyed '<name>_1'; 'A' stands in when missing
                protein = translations.get(name + '_1', 'A')
                if 'X' in protein[:-1]:
                    status = 3  # premature
                elif seq[:3] not in start_codons:
                    status = 4  # no start
                elif protein[-1] != 'X':
                    status = 5  # no stop
                else:
                    status = 6  # intact
            else:
                status = 2  # frameshift
            per_locus[allele_id] = [len(seq), status]
        return allele_stat
Пример #8
0
    def runUBlast(self, ref, qry, nhits=6, frames='7'):
        """Search the translated query against translated reference chunks with usearch -ublast.

        The query is translated with ``frame='F'`` and, per sequence, the
        translation that 'X' cuts into the fewest segments is written as
        '<name>:<frame number>'. The reference is translated in ``frames``
        and split round-robin over five chunks; usearch runs once per chunk
        and each non-empty result is parsed into a DataFrame.

        ref, qry -- inputs for readFastq, read only when self.refSeq /
                    self.qrySeq are still empty.
        nhits    -- value for usearch's ``-maxhits`` option.
        frames   -- frame specification forwarded to transeq for the reference.

        Returns the pd.concat of the parsed chunk results (pd.concat raises
        ValueError when no chunk produced any output).

        Fixes relative to the previous version:
        * the chunk stride was 4 while five chunks were run, so the fifth
          chunk repeated references already in the first one and produced
          duplicate hits; both now use the same constant;
        * np.vectorize without ``otypes`` calls the function once extra on
          the first element, so the first hit's CIGAR was trimmed twice -
          a plain loop replaces it (and also works on an empty table);
        * the result file is closed after parsing;
        * ``n`` is passed to ``str.rsplit`` by keyword (positional use is
          rejected by pandas 2).
        """
        logger('Run uBLAST starts')

        def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Turn usearch -userout rows into nucleotide-coordinate hits.

            Column numbers follow the -userfields list of the command below:
            0 query, 1 target, 2 id, 3 alnlen, 4 mism, 5 opens, 6 qlo, 7 qhi,
            8 tlo, 9 thi, 10 evalue, 11 raw, 12 ql, 13 tl, 14 qrow, 15 trow,
            16 qstrand. Columns 15 and 16 are dropped from the result.
            """
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab[2] /= 100.
            blastab = blastab[blastab[2] >= min_id]
            # amino-acid counts -> nucleotide counts
            blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3

            # names are '<sequence>:<frame>'; split the frame off again
            qf = blastab[0].str.rsplit(':', n=1, expand=True)
            rf = blastab[1].str.rsplit(':', n=1, expand=True)
            if np.all(qf[0].str.isdigit()):
                qf[0] = qf[0].astype(int)
            if np.all(rf[0].str.isdigit()):
                rf[0] = rf[0].astype(int)
            blastab[0], qf = qf[0], qf[1].astype(int)
            blastab[1], rf = rf[0], rf[1].astype(int)
            blastab[6], blastab[7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
            # getCIGAR is defined elsewhere; its (length, op) pairs are scaled by 3
            blastab[14] = [[[3 * vv[0], vv[1]] for vv in v]
                           for v in map(getCIGAR, zip(blastab[15], blastab[14]))]

            # replace the reported lengths with the nucleotide sequence lengths
            blastab[12] = blastab[0].apply(lambda x: len(qryseq[str(x)]))
            blastab[13] = blastab[1].apply(lambda x: len(refseq[str(x)]))

            # frames 1-3 are counted from the start of the reference,
            # higher frame numbers from its end (column 13 = its length)
            rf3 = (rf <= 3)
            blastab.loc[rf3, 8] = blastab.loc[rf3, 8] * 3 + rf[rf3] - 3
            blastab.loc[rf3, 9] = blastab.loc[rf3, 9] * 3 + rf[rf3] - 1
            blastab.loc[~rf3, 8] = blastab.loc[~rf3, 13] - (
                blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 - 3) + 1
            blastab.loc[~rf3, 9] = blastab.loc[~rf3, 13] - (
                blastab.loc[~rf3, 9] * 3 + rf[~rf3] - 3 - 1) + 1

            # d = how far the hit end runs past the query end, past the
            # reference end or before reference position 1 (never negative)
            d = np.max([
                blastab[7] - blastab[12], blastab[9] - blastab[13],
                1 - blastab[9],
                np.zeros(blastab.shape[0], dtype=int)
            ], axis=0)
            blastab[7] -= d

            # shorten the last CIGAR element of every hit by its overshoot
            for cigar, overshoot in zip(blastab[14], d.tolist()):
                cigar[-1][0] -= overshoot

            d[~rf3] *= -1
            blastab[9] -= d
            blastab = blastab[
                (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12])
                & (blastab[7] - blastab[6] + 1 >= min_cov)]
            return blastab.drop(columns=[15, 16])

        n_chunks = 5
        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F')
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, frame_idx, s = min((len(aa[:-1].split('X')), idx, aa)
                                      for idx, aa in enumerate(ss))
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_idx + 1, s))

        refAASeq = transeq(self.refSeq, frames)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_idx, s in enumerate(ss):
                toWrite.append('>{0}:{1}\n{2}\n'.format(n, frame_idx + 1, s))

        blastab = []
        for chunk_id in xrange(n_chunks):
            chunk = toWrite[chunk_id::n_chunks]
            if not chunk:
                # nothing to search; do not run usearch on an empty database
                continue
            with open(refAA, 'w') as fout:
                fout.writelines(chunk)

            ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
                usearch=usearch,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio)
            Popen(ublast_cmd.split(),
                  stderr=PIPE,
                  stdout=PIPE,
                  universal_newlines=True).communicate()
            if os.path.getsize(aaMatch) > 0:
                with open(aaMatch) as fin:
                    blastab.append(
                        parseUBlast(fin, self.refSeq, self.qrySeq,
                                    self.min_id, self.min_cov, self.min_ratio))
        blastab = pd.concat(blastab)
        logger('Run uBLAST finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Пример #9
0
def getClust(prefix, genes, params):
    """Cluster genes with repeated mmseqs linclust runs and write exemplars plus a gene-to-exemplar table.

    When params['translate'] is set, the genes are read with readFasta,
    translated (transeq, frame '1', table 'starts') and the first
    translation of each is clustered; otherwise the file ``genes`` is
    clustered directly. Up to three rounds are run, each on the exemplars
    kept by the previous round; the loop stops early once a round no longer
    reduces the number of exemplars. Work happens in a temporary directory
    created under '.' and removed afterwards.

    Uses params['identity'], params['coverage'] and params['n_thread'] for
    linclust.

    Returns ('<prefix>.clust.exemplar', '<prefix>.clust.tab').
    """
    exemplar_out = '{0}.clust.exemplar'.format(prefix)
    table_out = '{0}.clust.tab'.format(prefix)

    groups = {}
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        if params['translate']:
            na_seqs = readFasta(genes)
            aa_seqs = transeq(na_seqs, frame='1', transl_table='starts')
            geneFile = os.path.join(dirPath, 'seq.aa')
            with open(geneFile, 'w') as fout:
                for name, translations in aa_seqs:
                    fout.write('>{0}\n{1}\n'.format(name, translations[0]))
        else:
            geneFile = genes

        seqDb, tmpDb, lcDb, tabFile, refFile = [
            os.path.join(dirPath, leaf)
            for leaf in ('seq.db', 'tmp', 'seq.lc', 'clust.tab', 'seq.ref')
        ]

        nRef = 999999999999999
        for _ in xrange(3):
            # start every round from a clean scratch directory and databases
            if os.path.isdir(tmpDb):
                shutil.rmtree(tmpDb)
            os.makedirs(tmpDb)
            for db in (seqDb, lcDb):
                if os.path.isfile(db):
                    for leftover in glob.glob(db + '*'):
                        os.unlink(leftover)

            createdb_cmd = '{0} createdb {2} {1} -v 0'.format(
                externals['mmseqs'], seqDb, geneFile)
            subprocess.Popen(createdb_cmd.split()).communicate()
            linclust_cmd = '{0} linclust {1} {2} {3} --min-seq-id {4} -c {5} --threads {6} -v 0'.format(
                externals['mmseqs'], seqDb, lcDb, tmpDb, params['identity'],
                params['coverage'], params['n_thread'])
            subprocess.Popen(linclust_cmd.split(),
                             stdout=subprocess.PIPE).communicate()
            createtsv_cmd = '{0} createtsv {1} {1} {2} {3}'.format(
                externals['mmseqs'], seqDb, lcDb, tabFile)
            subprocess.Popen(createtsv_cmd.split(),
                             stdout=subprocess.PIPE).communicate()

            # second column of the table -> first column
            with open(tabFile) as fin:
                for line in fin:
                    fields = line.strip().split()
                    groups[fields[1]] = fields[0]

            # keep the first record seen for every group; records whose name
            # has no group map to None, which is pre-registered and so skipped
            kept_lines = []
            keep, used_grps = False, {None: 1}
            with open(geneFile) as fin:
                for line in fin:
                    if line.startswith('>'):
                        name = line[1:].strip().split()[0]
                        grp = groups.get(name, None)
                        keep = grp not in used_grps
                        if keep:
                            used_grps[grp] = name
                    if keep:
                        kept_lines.append(line)
            for gene, grp in groups.items():
                if grp in used_grps:
                    groups[gene] = used_grps[grp]

            with open(refFile, 'w') as fout:
                fout.writelines(kept_lines)

            if nRef <= len(used_grps):
                break
            nRef = len(used_grps)
            geneFile = refFile

        if params['translate']:
            # exemplars were chosen on translations; emit the nucleotide originals
            rSeq = readFasta(refFile)
            na_seqs = dict(na_seqs)
            with open(exemplar_out, 'w') as fout:
                for name, _aa in rSeq:
                    fout.write('>{0}\n{1}\n'.format(name, na_seqs[name]))
        else:
            shutil.copy2(refFile, exemplar_out)
    finally:
        shutil.rmtree(dirPath)

    with open(table_out, 'w') as fout:
        for gene, grp in sorted(groups.items()):
            # follow the chain of assignments until an entry maps to itself
            current = gene
            while current != grp:
                current, grp = grp, groups[grp]
            groups[gene] = grp
            fout.write('{0}\t{1}\n'.format(gene, grp))

    return exemplar_out, table_out
Пример #10
0
def _collect_predictions(prediction, clust_ref, alleles, allele_file):
    """Adjust hit ends, merge neighbouring hits and group the rows by column 4.

    Row columns as used here (meanings inferred from use - confirm against
    the producer of the table): 0 gene, 2 and 3 further identifiers,
    4 grouping key, 5 contig, 7/8 start/end within the gene, 9/10 start/end
    on the contig, 12 gene length, 13 contig length. Column 11 is
    overwritten with the strand and column 1 is set to -1 on rows that
    conflict with their predecessor. Rows are modified in place.
    """
    predictions = {}
    for part in prediction:
        if part[0] not in alleles:
            # the cluster reference sequence is always allele 1 of its gene
            alleles[part[0]] = {clust_ref[part[0]]: 1}
            allele_file.write('>{0}_{1}\n{2}\n'.format(part[0], 1,
                                                       clust_ref[part[0]]))

        # left/right: bases missing at each end of the gene, capped by the
        # room left on the contig; step: +1 forward hit, -1 reverse hit
        if part[9] < part[10]:
            left = min(part[7] - 1, part[9] - 1)
            right = min(part[12] - part[8], part[13] - part[10])
            step = 1
        else:
            left = min(part[7] - 1, part[13] - part[9])
            right = min(part[12] - part[8], part[10] - 1)
            step = -1

        # an end missing at most 6 bases is stretched to the gene end;
        # otherwise it is moved inwards to the next codon boundary
        if left <= 6 and part[7] - left == 1:
            part[7], part[9] = part[7] - left, part[9] - left * step
        else:
            ll = (part[7] - 1) % 3
            if ll > 0:
                part[7], part[9] = part[7] + 3 - ll, part[9] + (3 - ll) * step
        if right <= 6 and part[8] + right == part[12]:
            part[8], part[10] = part[8] + right, part[10] + right * step
        else:
            rr = (part[12] - part[8]) % 3
            if rr > 0:
                # both coordinates move by the same 3 - rr bases (the contig
                # coordinate used to move by 3 + rr, desynchronising them)
                part[8], part[10] = part[8] - (3 - rr), part[10] - (3 - rr) * step

        if part[9] < part[10]:
            part[9:12] = part[9], part[10], '+'
        else:
            part[9:12] = part[10], part[9], '-'

        if part[4] not in predictions:
            predictions[part[4]] = []
        elif predictions[part[4]][-1][2] == part[2]:
            prev = predictions[part[4]][-1]
            if prev[5] == part[5] and part[7] - prev[8] < 500:
                # same contig and less than 500 apart: extend the previous row
                if part[11] == '+' and part[9] - prev[10] < 500:
                    prev[8], prev[10] = part[8], part[10]
                    continue
                elif part[11] == '-' and prev[9] - part[10] < 500:
                    prev[8], prev[9] = part[8], part[9]
                    continue
            predictions[part[4]][-1][1], part[1] = -1, -1
        predictions[part[4]].append(part)
    return predictions


def _write_gff(prefix, predictions, genomes, alleles, allele_file,
               old_prediction):
    """Write '<prefix>.EToKi.gff' and append newly seen alleles to allele_file."""
    op = ['', 0, []]
    with open('{0}.EToKi.gff'.format(prefix), 'w') as fout:
        for gid, (g, predict) in enumerate(predictions.items()):
            predict.sort(key=itemgetter(5, 9, 10))
            for pid, pred in enumerate(predict):
                # matched region; set for every row because the GFF line
                # below always reports it
                s, e = pred[9:11]
                if pred[1] == -1 or (pred[10] - pred[9] + 1) <= 0.8 * pred[12]:
                    cds, allele_id = 'fragment:{0:.2f}%'.format(
                        (pred[10] - pred[9] + 1) * 100 / pred[12]), 'uncertain'
                    start, stop = s, e
                else:
                    # add flanks in whole codons: up to 60 bases before and
                    # 600 after the region in reading direction; floor
                    # division keeps the flank a multiple of 3 on Python 3
                    if pred[11] == '+':
                        s2, e2 = s - min(3 * ((s - 1) // 3), 60), e + min(
                            3 * int((pred[13] - e) / 3), 600)
                        seq = genomes[pred[5]][1][(s2 - 1):e2]
                        lp, rp = s - s2, e2 - e
                    else:
                        s2, e2 = s - min(3 * ((s - 1) // 3), 600), e + min(
                            3 * int((pred[13] - e) / 3), 60)
                        seq = rc(genomes[pred[5]][1][(s2 - 1):e2])
                        rp, lp = s - s2, e2 - e

                    seq2 = seq[(lp):(len(seq) - rp)]
                    if seq2 not in alleles[pred[0]]:
                        if pred[3] == pred[0] and pred[7] == 1 and pred[
                                8] == pred[12]:
                            alleles[pred[0]][seq2] = len(alleles[pred[0]]) + 1
                        else:
                            alleles[pred[0]][seq2] = 'LowQ{0}'.format(
                                len(alleles[pred[0]]) + 1)
                        allele_id = str(alleles[pred[0]][seq2])
                        allele_file.write('>{0}_{1}\n{2}\n'.format(
                            pred[0], allele_id, seq2))
                    else:
                        allele_id = str(alleles[pred[0]][seq2])

                    # try frame 0 and, when the length is not a multiple of
                    # 3, the frame given by the remainder
                    frames = sorted(set([0, len(seq) % 3]))
                    for frame, aa_seq in zip(
                            frames,
                            transeq({'n': seq},
                                    transl_table='starts',
                                    frame=','.join(
                                        [str(f + 1) for f in frames]))['n']):
                        cds = 'CDS'
                        s0, s1 = aa_seq.find('M', int(lp / 3),
                                             int(lp / 3 + 30)), aa_seq.rfind(
                                                 'M', 0, int(lp / 3))
                        start = s0 if s0 >= 0 else s1
                        if start < 0:
                            cds, start = 'nostart', int(lp / 3)
                        stop = aa_seq.find('X', start)
                        if 0 <= stop < lp / 3 + 30:
                            s0 = aa_seq.find('M', stop, int(lp / 3 + 30))
                            if s0 >= 0:
                                start = s0
                                stop = aa_seq.find('X', start)
                        if stop < 0:
                            cds = 'nostop'
                        elif (stop - start + 1) * 3 <= 0.8 * pred[12]:
                            cds = 'premature stop:{0:.2f}%'.format(
                                (stop - start + 1) * 300 / pred[12])

                        if cds == 'CDS':
                            if pred[11] == '+':
                                start, stop = s2 + start * 3 + frame, s2 + stop * 3 + 2 + frame
                            else:
                                start, stop = e2 - stop * 3 - 2 - frame, e2 - start * 3 - frame
                            break
                        else:
                            start, stop = s, e
                            if frame > 0:
                                cds = 'frameshift'

                # op = [contig, resume index, earlier annotations of that
                # contig]; entries are read as (name, start, end, strand)
                if pred[5] != op[0]:
                    op = [pred[5], 0, old_prediction.get(pred[5], [])]
                old_tag = []
                for k in xrange(op[1], len(op[2])):
                    opd = op[2][k]
                    if opd[2] < start:
                        op[1] = k + 1
                    elif opd[1] > stop:
                        break
                    elif opd[3] != pred[11]:
                        continue
                    ovl = min(opd[2], stop) - max(opd[1], start) + 1
                    if ovl >= 300 or ovl >= 0.6 * (
                            opd[2] - opd[1] + 1) or ovl >= 0.6 * (stop -
                                                                  start + 1):
                        frame = min((opd[1] - start) % 3, (opd[2] - stop) % 3)
                        if frame == 0:
                            old_tag.append('{0}:{1}-{2}'.format(*opd))

                fout.write(
                    '{0}\t{1}\tEToKi-ortho\t{2}\t{3}\t.\t{4}\t.\tID={5};{12}inference=ortholog group:{6},allele ID:{7},matched region:{8}-{9}{10}{11}\n'
                    .format(
                        pred[5],
                        'CDS' if cds == 'CDS' else 'pseudogene',
                        start,
                        stop,
                        pred[11],
                        '{0}_{1}_{2}'.format(prefix, gid, pid),
                        pred[0],
                        allele_id,
                        s,
                        e,
                        '' if pred[0] == pred[3] else
                        ',structure variant group:' + pred[3],
                        '' if cds == 'CDS' else ';pseudogene=' + cds,
                        '' if len(old_tag) == 0 else 'locus_tag={0};'.format(
                            ','.join(old_tag)),
                    ))


def write_output(prefix, prediction, genomes, clust_ref, old_prediction):
    """Write '<prefix>.allele.fna' and '<prefix>.EToKi.gff' from a table of gene hits.

    prefix         -- output prefix; also used in the GFF ID attribute.
    prediction     -- path of a header-less, tab-separated table of hits.
    genomes        -- mapping contig -> record whose element [1] is the
                      sequence that regions are sliced from.
    clust_ref      -- mapping gene -> reference sequence, written as allele 1.
    old_prediction -- mapping contig -> list of earlier annotations, used
                      to add locus_tag attributes to overlapping entries.

    Returns None.

    Fixes relative to the previous version:
    * the matched region (s, e) is set for every row; rows reported as
      fragments used to reuse the previous row's values or raise NameError;
    * trimming the right end to a codon boundary moves the contig
      coordinate by the same amount as the gene coordinate;
    * the upstream flank is computed with floor division, so it stays a
      multiple of 3 under true division;
    * the allele file is closed even when an error occurs.
    """
    alleles = {}
    rows = pd.read_csv(prediction, sep='\t', header=None).values
    with open('{0}.allele.fna'.format(prefix), 'w') as allele_file:
        predictions = _collect_predictions(rows, clust_ref, alleles,
                                           allele_file)
        _write_gff(prefix, predictions, genomes, alleles, allele_file,
                   old_prediction)
    return
Пример #11
0
    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Run diamond blastp between translated reference pieces and the translated query.

        The query is translated with ``frame='F'`` and, per sequence, the
        translation that 'X' cuts into the fewest segments is written as
        '<name>:<frame number>'. The reference is translated in ``frames``,
        cut into pieces named '<name>:<frame number>:<offset>' and split
        round-robin over five chunks; diamond runs once per chunk and every
        result is parsed into a DataFrame.

        ref, qry -- inputs for readFastq, read only when self.refSeq /
                    self.qrySeq are still empty.
        nhits    -- value for diamond's ``-k`` option.
        frames   -- frame specification forwarded to transeq for the reference.

        Returns the pd.concat of the per-chunk tables (pd.concat raises
        ValueError when no chunk yielded a hit).

        Fixes relative to the previous version:
        * ``tab`` is reset for every chunk; an empty result file used to
          raise NameError on the first chunk or re-append the previous
          chunk's hits on later ones;
        * parseDiamond returns None when no row passes the filters instead
          of failing with KeyError on an empty DataFrame;
        * the result file is closed before it is deleted;
        * the tag regular expressions are raw strings.
        """
        logger('Run diamond starts')

        def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Parse diamond's '--outfmt 101' output into a DataFrame of nucleotide-coordinate hits.

            Lines starting with '@' and records whose third field is '*' are
            skipped. Field positions used: 0 query name, 2 reference name,
            3 position, 5 CIGAR, 9 aligned query sequence, 12 NM tag, 14 ZR
            tag, 18 ZS tag (the tags are searched for in the whole line when
            they are not at those positions). Returns None when nothing
            passes the min_cov / min_ratio / min_id filters.
            """
            blastab = []
            for line in fin:
                if line.startswith('@'):
                    continue
                part = line.strip().split('\t')
                if part[2] == '*':
                    continue
                qn, qf = part[0].rsplit(':', 1)
                rn, rf, rx = part[2].rsplit(':', 2)
                # position within the piece + offset of the piece
                rs = int(part[3]) + int(rx)
                ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
                qm = len(part[9])
                if qm * 3 < min_cov:
                    continue
                cov_ratio = qm * 3. / ql
                if cov_ratio < min_ratio:
                    continue
                # CIGAR lengths are scaled from amino acids to nucleotides
                cigar = [[int(n) * 3, t]
                         for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
                cl = np.sum([c[0] for c in cigar])
                if part[12].startswith('NM:'):
                    variation = float(part[12][5:]) * 3
                else:
                    variation = float(re.findall(r'NM:i:(\d+)', line)[0]) * 3

                iden = 1 - round(variation / cl, 3)
                if iden < min_id:
                    continue
                qf, rf = int(qf), int(rf)
                if part[18].startswith('ZS:'):
                    qs = int(part[18][5:])
                else:
                    qs = int(re.findall(r'ZS:i:(\d+)', line)[0])

                rm = int(
                    np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
                # frames 1-3 are counted from the sequence start, higher
                # frame numbers from its end; both values of each pair are
                # computed from the old start, hence the tuple assignments
                if rf <= 3:
                    rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
                else:
                    rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                        (rs + rm - 1) * 3 + rf - 4) + 1
                if qf <= 3:
                    qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
                else:
                    qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                        (qs + qm - 1) * 3 + qf - 4) + 1
                    qs, qe, rs, r_e = qe, qs, r_e, rs
                    cigar = list(reversed(cigar))

                cd = [c[0] for c in cigar if c[1] != 'M']
                if part[14].startswith('ZR:'):
                    score = int(part[14][5:])
                else:
                    score = int(re.findall(r'ZR:i:(\d+)', line)[0])
                blastab.append([
                    qn, rn, iden, cl,
                    int(variation - sum(cd)),
                    len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
                ])
            if not blastab:
                return None
            blastab = pd.DataFrame(blastab)
            blastab[[0, 1]] = blastab[[0, 1]].astype(str)
            return blastab

        n_chunks = 5
        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, frame_idx, s = min((len(aa[:-1].split('X')), idx, aa)
                                      for idx, aa in enumerate(ss))
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_idx + 1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(diamond_fmt.split(),
              stderr=PIPE,
              stdout=PIPE,
              universal_newlines=True).communicate()

        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_idx, s in enumerate(ss):
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]  # remove the 'X' appended for the regex
                cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss):
                    if len(cs):
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                            n, frame_idx + 1, ci, cs))

        blastab = []
        for chunk_id in xrange(n_chunks):
            with open(refAA, 'w') as fout:
                fout.writelines(toWrite[chunk_id::n_chunks])
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            Popen(diamond_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()

            tab = None  # never carry a result over from the previous chunk
            if os.path.getsize(aaMatch) > 0:
                with open(aaMatch) as fin:
                    tab = parseDiamond(fin, self.refSeq, self.qrySeq,
                                       self.min_id, self.min_cov,
                                       self.min_ratio)
                os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)
        blastab = pd.concat(blastab)
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab