Exemplo n.º 1
0
 def runBlast(self, ref, qry) :
     """Align the query sequences against the reference with BLASTn.

     The reference is written as FASTA into ``self.dirPath`` and indexed
     with makeblastdb.  The queries are sorted by decreasing length and
     dealt round-robin into at most ``self.n_thread`` FASTA files, each of
     which is handed to ``poolBlast`` through ``self.pool``.

     Args:
         ref: reference file; only read when ``self.refSeq`` is empty.
         qry: query file; only read when ``self.qrySeq`` is empty.

     Returns:
         The stacked per-chunk tables as one numpy array, or an empty
         ``(0, 15)`` object array when no chunk produced a result.
     """
     logger('Run BLASTn starts')
     if not self.qrySeq :
         self.qrySeq, self.qryQual = readFastq(qry)
     if not self.refSeq :
         self.refSeq, self.refQual = readFastq(ref)
     refDb = refNA = os.path.join(self.dirPath, 'refNA')
     with open(refNA, 'w') as fout :
         for n, s in self.refSeq.items() :
             fout.write('>{0}\n{1}\n'.format(n, s))
     # Build argv as a list: only the executable string is split, so a
     # working directory that contains whitespace no longer breaks the call.
     makedb_cmd = str(makeblastdb).split() + ['-dbtype', 'nucl', '-in', refNA, '-out', refDb]
     Popen(makedb_cmd, stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()
     # Longest queries first, then a strided split over the chunks.
     qrySeq = sorted(self.qrySeq.items(), key=lambda s:-len(s[1]))
     n_chunk = min(len(qrySeq), self.n_thread)
     qrys = [ os.path.join(self.dirPath, 'qryNA.{0}'.format(chunk)) for chunk in range(n_chunk) ]
     for chunk, q in enumerate(qrys) :
         with open(q, 'w') as fout :
             for n, s in qrySeq[chunk::self.n_thread] :
                 fout.write('>{0}\n{1}\n'.format(n, s))
     blastab = []
     for r in self.pool.imap_unordered(poolBlast, [ [blastn, refDb, q, self.min_id, self.min_cov, self.min_ratio] for q in qrys ]) :
         # poolBlast hands back the path of a saved array, or None.
         if r is not None :
             blastab.append(np.load(r, allow_pickle=True))
             os.unlink(r)
     if len(blastab) :
         blastab = np.vstack(blastab)
     else :
         blastab = np.empty([0, 15], dtype=object)
     logger('Run BLASTn finishes. Got {0} alignments'.format(blastab.shape[0]))
     return blastab
Exemplo n.º 2
0
    def reScore(self, ref, qry, blastab, mode, perBatch=10000):
        """Recompute columns 2 and 11 of ``blastab`` with ``cigar2score``.

        The cached query and reference sequences are replaced in place by
        their ``nucEncoder`` encoding, then the table is processed in
        batches of ``perBatch`` rows.  For every row the reference slice
        (``4 - slice[::-1]`` when column 8 is not below column 9), the
        query slice and the entry of column 14 are passed to
        ``cigar2score``.

        Args:
            ref: reference file; only read when ``self.refSeq`` is empty.
            qry: query file; only read when ``self.qrySeq`` is empty.
            blastab: alignment table, updated in place.
            mode: forwarded unchanged to ``cigar2score``.
            perBatch: number of rows handled per batch.

        Returns:
            The same ``blastab`` object.
        """
        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)
        # Encode queries first, then references, overwriting the cache.
        for seqs in (self.qrySeq, self.refSeq):
            for name, bases in seqs.items():
                seqs[name] = nucEncoder[np.array(list(bases)).view(asc2int)]

        total = len(blastab)
        for start in xrange(0, blastab.shape[0], perBatch):
            logger('Update scores: {0} / {1}'.format(start, total))
            batch = blastab[start:start + perBatch]
            jobs = []
            for t in batch:
                encRef = self.refSeq[str(t[1])]
                if t[8] < t[9]:
                    refPart = encRef[t[8] - 1:t[9]]
                else:
                    refPart = 4 - encRef[t[9] - 1:t[8]][::-1]
                qryPart = self.qrySeq[str(t[0])][t[6] - 1:t[7]]
                jobs.append([t[14], refPart, qryPart, t[6], mode, 6, 1])
            scores = np.array([cigar2score(job) for job in jobs])
            # Write the two score columns back through the batch view.
            batch.T[2], batch.T[11] = scores.T
        return blastab
Exemplo n.º 3
0
    def runDiamond(self, ref, qry, nhits=10, frames='7') :
        """Search translated queries against the translated reference with diamond.

        One translation per query is written to ``qryAA`` and turned into a
        diamond database.  The translated reference is cut into pieces,
        dealt over five FASTA files, and each file is searched against that
        database with ``diamond blastp``.  The five outputs are parsed by
        ``parseDiamond`` through ``self.pool``.

        Args:
            ref: reference file; only read when ``self.refSeq`` is empty.
            qry: query file; only read when ``self.qrySeq`` is empty.
            nhits: value given to diamond's ``-k`` option.
            frames: frame selection forwarded to ``transeq`` for the reference.

        Returns:
            The stacked per-chunk tables as one numpy array, or an empty
            ``(0, 15)`` object array when no chunk produced a result.
        """
        logger('Run diamond starts')

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq :
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq :
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout :
            for n, ss in sorted(qryAASeq.items()) :
                # Keep the translation that splits into the fewest pieces on
                # 'X' (presumably the stop symbol emitted by transeq).
                _, frame, s = min([ (len(s[:-1].split('X')), frame, s) for frame, s in enumerate(ss) ])
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame+1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()

        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()) :
            for frame, s in enumerate(ss) :
                # Cut at the first 'X' after at least 1000 residues; the tail
                # keeps whatever is left.  The appended 'X' is removed again.
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]
                # Offset of every piece inside the translated frame.
                cdsi = np.cumsum([0]+list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss) :
                    if len(cs) :
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(n, frame+1, ci, cs))

        n_chunk = 5
        for chunk in xrange(n_chunk) :
            with open('{0}.{1}'.format(refAA, chunk), 'w') as fout :
                for line in toWrite[chunk::n_chunk] :
                    fout.write(line)
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond, refAA='{0}.{1}'.format(refAA, chunk), qryAA=qryAA, aaMatch='{0}.{1}'.format(aaMatch, chunk), n_thread=self.n_thread, min_id=self.min_id*100., nhits=nhits, min_ratio=self.min_ratio*100.)
            Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
        blastab = []
        for r in self.pool.imap_unordered(parseDiamond, [ ['{0}.{1}'.format(aaMatch, chunk), self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio] for chunk in xrange(n_chunk) ]) :
            # parseDiamond hands back the path of a saved array, or None.
            if r is not None :
                blastab.append(np.load(r, allow_pickle=True))
                os.unlink(r)
        # np.vstack raises on an empty list, so a search without any hit gets
        # the same empty table that runBlast returns in that situation.
        if len(blastab) :
            blastab = np.vstack(blastab)
        else :
            blastab = np.empty([0, 15], dtype=object)
        logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
        return blastab
Exemplo n.º 4
0
def prepReference(prefix, ref_tag, reference, aligner, pilercr, trf, **args) :
    """Index the reference, align it against itself and flag repeat regions.

    Steps: build ``<prefix>.mmi`` with the aligner, dump the reference as
    plain FASTA into a temporary file, collect repeat regions from ``trf``
    and ``pilercr`` on that file, run ``alignAgainst`` with the reference
    as both reference and query, and append the repeat regions as
    ``unsure`` features to the file named by the second element of its
    result.

    Args:
        prefix: output prefix.
        ref_tag: tag of the reference; used for file naming and as both
            tags passed to ``alignAgainst``.
        reference: reference sequence file, read with ``readFastq``.
        aligner: aligner executable, or a list whose first element is the
            indexing executable.
        pilercr: pilercr executable.
        trf: trf executable.
        **args: ignored.

    Returns:
        The value returned by ``alignAgainst``.

    Raises:
        ValueError: if ``reference`` is empty.  The original code reached
            ``return alignments`` with the name unbound in that case.
    """
    def mask_tandem(fasta_file) :
        """Run trf and return [contig, start-2, end+2] for each reported line."""
        cmd = '{0} {1} 2 4 7 80 10 60 2000 -d -h -ngs'.format(trf, fasta_file)
        trf_run = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, universal_newlines=True)

        region = []
        try :
            for line in iter(trf_run.stdout.readline, r'') :
                if line[0] == '@' :
                    # header line: remember the current contig name
                    cont_name = line[1:].strip().split()[0]
                else :
                    # the first two space-separated fields are used as coordinates
                    part = line.split(' ',2)[:2]
                    region.append([cont_name, int(part[0])-2, int(part[1])+2])
        finally :
            # close the pipe and reap the child so it is not left behind
            trf_run.stdout.close()
            trf_run.wait()
        return region

    def mask_crispr(fasta_file, prefix) :
        """Run pilercr and return [contig, start, end] for each summary row."""
        cmd = '{0} -in {1} -out {2}.crispr'.format(pilercr, fasta_file, prefix)
        subprocess.Popen(cmd.split(), stderr=subprocess.PIPE).communicate()
        summary_trigger = 0

        region = []
        with open('{0}.crispr'.format(prefix)) as fin :
            for line in fin :
                # only the part after this heading is parsed
                if line.startswith('SUMMARY BY POSITION') :
                    summary_trigger = 1
                elif summary_trigger :
                    if line[0] == '>' :
                        cont_name = line[1:].strip().split()[0]
                    elif len(line) > 10 and line.strip()[0] in '0123456789' :
                        # from column 24 on: first field is taken as the start,
                        # second as the length
                        part = line[24:].strip().split()
                        region.append([cont_name, int(part[0]), int(part[0]) + int(part[1]) -1])
        os.unlink('{0}.crispr'.format(prefix))
        return region

    if not reference :
        raise ValueError('prepReference requires a reference sequence file')

    # prepare reference
    if not isinstance(aligner, list) :
        subprocess.Popen('{0} -k15 -w5 -d {2}.mmi {1}'.format(aligner, reference, prefix).split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    else :
        subprocess.Popen('{0} -cR01 {2}.mmi {1}'.format(aligner[0], reference, prefix).split()).communicate()
    import tempfile
    with tempfile.NamedTemporaryFile(dir='.') as tf :
        # readFastq handles the input format; the repeat finders get plain FASTA
        seq, _ = readFastq(reference)
        tf_fas = '{0}.fasta'.format(tf.name)
        with open(tf_fas, 'wt') as fout:
            for n, s in seq.items() :
                fout.write('>{0}\n{1}\n'.format(n, s))
        repeats = mask_tandem(tf_fas) + mask_crispr(tf_fas, tf.name)
        os.unlink(tf_fas)
    alignments = alignAgainst([prefix +'.' + ref_tag.rsplit('.', 1)[0] + '.0', aligner, prefix + '.mmi', [ref_tag, reference], [ref_tag, reference]])
    with uopen(alignments[1], 'a') as fout :
        for r in repeats :
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="repetitive_regions"\n'.format(
                r[0], r[1], r[2], 
            ))
    return alignments
Exemplo n.º 5
0
 def runBlast(self, ref, qry):
     """Align the query sequences against the reference with BLASTn.

     The BLAST database is built from a FASTA dump of ``self.refSeq`` when
     ``self.refQual`` is not None, otherwise directly from ``ref``.  The
     queries are sorted by decreasing length and dealt round-robin into at
     most ``self.n_thread`` FASTA files, which ``poolBlast`` processes
     through ``self.pool``.

     Args:
         ref: reference file.
         qry: query file; only read when ``self.qrySeq`` is empty.

     Returns:
         A pandas DataFrame with one row per alignment; column 14 holds
         lists of lists.  The frame is empty (15 columns) when no chunk
         produced a result.
     """
     logger('Run BLASTn starts')
     if not self.qrySeq:
         self.qrySeq, self.qryQual = readFastq(qry)
     if not self.refSeq:
         self.refSeq, self.refQual = readFastq(ref)
     refDb = refNA = os.path.join(self.dirPath, 'refNA')
     if self.refQual is not None:
         with open(refNA, 'w') as fout:
             for n, s in self.refSeq.items():
                 fout.write('>{0}\n{1}\n'.format(n, s))
     else:
         refNA = ref
     # Build argv as a list: only the executable string is split, so paths
     # that contain whitespace no longer break the call.
     makedb_cmd = str(makeblastdb).split() + [
         '-dbtype', 'nucl', '-in', refNA, '-out', refDb
     ]
     Popen(makedb_cmd,
           stderr=PIPE,
           stdout=PIPE,
           universal_newlines=True).communicate()
     # Longest queries first, then a strided split over the chunks.
     qrySeq = sorted(self.qrySeq.items(), key=lambda s: -len(s[1]))
     qrys = [
         os.path.join(self.dirPath, 'qryNA.{0}'.format(chunk))
         for chunk in range(min(len(qrySeq), self.n_thread))
     ]
     for chunk, q in enumerate(qrys):
         with open(q, 'w') as fout:
             for n, s in qrySeq[chunk::self.n_thread]:
                 fout.write('>{0}\n{1}\n'.format(n, s))
     res = self.pool.map(
         poolBlast,
         [[blastn, refDb, q, self.min_id, self.min_cov, self.min_ratio]
          for q in qrys])
     # poolBlast hands back the path of a msgpack file, or None.
     res = [r for r in res if r is not None]
     if res:
         blastab = pd.DataFrame(
             np.vstack([pd.read_msgpack(r).values for r in res]))
     else:
         # np.vstack raises on an empty list; return an empty table instead.
         blastab = pd.DataFrame(np.empty([0, 15], dtype=object))
     # Turn the nested sequences of column 14 into plain lists.
     blastab[14] = [[list(t) for t in tab] for tab in blastab[14].tolist()]
     for r in res:
         os.unlink(r)
     logger('Run BLASTn finishes. Got {0} alignments'.format(
         blastab.shape[0]))
     return blastab
Exemplo n.º 6
0
 def run_lastal( refdb, query, output, lastal ) :
     """Run lastal on ``query`` against ``refdb`` and save stdout to ``output``.

     When the first run exits with a non-zero status, the query is
     rewritten as FASTQ with the quality characters ``!"#$%&'`` replaced by
     ``(`` and lastal is run again with ``-Q1`` on that file.

     Args:
         refdb: lastal database.
         query: query sequence file.
         output: path of the file that receives lastal's stdout.
         lastal: lastal executable.

     Returns:
         ``output``.
     """
     cmd = '{0} -j4 -r1 -q2 -a7 -b1 {1} {2}'.format( lastal, refdb, query )
     lastal_run = subprocess.Popen( cmd.split(), stdout=subprocess.PIPE, universal_newlines=True )
     with open(output, 'w') as fout:
         fout.write(lastal_run.communicate()[0])
     if lastal_run.returncode != 0 :
         # NOTE(review): this expects readFastq to return a mapping of
         # name -> (sequence, quality) - confirm against its definition.
         fastq = readFastq(query)
         qry_file = output + '.qry'
         try :
             with open(qry_file, 'w') as fout :
                 for n, (s, q) in fastq.items() :
                     fout.write('@{0}\n{1}\n+\n{2}\n'.format(n, s, re.sub(r'[!"#$%&\']', '(', q)))
             cmd = '{0} -Q1 -j4 -r1 -q2 -a7 -b1 {1} {2}'.format( lastal, refdb, qry_file )
             # Text mode here as well: without it communicate() returns bytes
             # on Python 3 and the write below fails.
             lastal_run = subprocess.Popen( cmd.split(), stdout=subprocess.PIPE, universal_newlines=True )
             with open(output, 'w') as fout:
                 fout.write(lastal_run.communicate()[0])
         finally :
             # remove the temporary query even when the retry fails
             if os.path.exists(qry_file) :
                 os.unlink(qry_file)
     return output
Exemplo n.º 7
0
def getMatrix(prefix, reference, alignments, core, matrixOut, alignmentOut) :
    """Combine per-sample alignment summaries into a core matrix and/or alignment.

    Every entry of ``alignments`` is read with ``readMap`` (through the
    module-level ``pool``), which yields ``(presences, absences,
    mutations)``.  Sites covered by fewer than ``len(alignments) * core``
    samples are dropped and reported as missing regions.

    Args:
        prefix: output prefix; files are ``<prefix>.matrix.gz`` and
            ``<prefix>.fasta.gz``.
        reference: reference sequence file, read with ``readFastq``.
        alignments: sequence of ``(tag, file)`` pairs.
        core: fraction of samples that must cover a site for it to be kept.
        matrixOut: write the variant matrix when true.
        alignmentOut: write the alignment when true.

    Returns:
        Dict with the key ``'matrix'`` and/or ``'alignment'`` mapped to the
        path of the file that was written.
    """
    refSeq, refQual = readFastq(reference)
    # coreSites: per-base count of samples covering the site.
    # matSites: per-base marker holding the 1-based site number where a
    # mutation was seen, 0 elsewhere.
    coreSites = { n:np.zeros(len(refSeq[n]), dtype=int) for n in refSeq }
    matSites = { n:np.zeros(len(refSeq[n]), dtype=int) for n in refSeq }
    alnId = { aln[0]:id for id, aln in enumerate(alignments) }
    res = pool.map(readMap, alignments)
    
    # matrix[(contig, site)] = [single-character calls, longer calls], one
    # slot per sample; '-' means no information yet.
    matrix = {}
    for presences, absences, mutations in res :
        for mut in mutations :
            j = alnId[mut[0]]
            site = tuple(mut[1:3])
            if site not in matrix :
                matrix[site] = [[], []]
                matSites[mut[1]][mut[2]-1] = mut[2]
            if len(mut[4]) == 1 :
                if len(matrix[site][0]) == 0 :
                    matrix[site][0] = ['-' for id in alnId]
                matrix[site][0][j] = mut[4]
            else :
                if len(matrix[site][1]) == 0 :
                    matrix[site][1] = ['-' for id in alnId]
                matrix[site][1][j] = mut[4]
    for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res) :
        j = alnId[mTag]
        for n, s, e in presences :
            coreSites[n][s-1:e] +=1
            # covered without a recorded call: mark as '.'
            mutations = matSites[n][s-1:e]
            for kk in mutations[mutations > 0] :
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '-' :
                    matrix[k][0][j] = '.'
                if len(matrix[k][1]) and matrix[k][1][j] == '-' :
                    matrix[k][1][j] = '.'
        for n, s, e, m in absences :
            coreSites[n][s-1:e] -=1
            # absent regions turn '.' back into '-'
            mutations = matSites[n][s-1:e]
            for kk in mutations[mutations > 0] :
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '.' :
                    matrix[k][0][j] = '-'
                if len(matrix[k][1]) and matrix[k][1][j] == '.' :
                    matrix[k][1][j] = '-'
    # Report how many sites are covered by how many samples.
    pres = np.unique(np.concatenate(list(coreSites.values())), return_counts=True)
    pres = [pres[0][pres[0] > 0], pres[1][pres[0] > 0]]
    coreNum = len(alignments) * core
    for p, n in zip(*pres) :
        sys.stderr.write('#{2} {0} {1}\n'.format(p, n, '' if p > coreNum else '#'))

    missings = []
    coreBases = {'A':0, 'C':0, 'G':0, 'T':0}
    for n in sorted(coreSites) :
        sites = coreSites[n]
        for site, num in enumerate(sites) :
            cSite = (n, site+1)
            if num < coreNum and cSite in matrix and len(matrix[cSite][1]) > 0 :
                # Count the samples that have a call in the second row.  The
                # row is a plain list, so it has to be counted element-wise;
                # comparing the list itself to '-' always gave 1.
                num = sum(1 for call in matrix[cSite][1] if call != '-')
                matrix[cSite][0] = []
            if num < coreNum :
                matrix.pop(cSite, None)
                # extend the current missing region or open a new one
                if len(missings) == 0 or missings[-1][0] != n or missings[-1][2] + 1 < cSite[1] :
                    missings.append([n, cSite[1], cSite[1]])
                else :
                    missings[-1][2] = cSite[1]
            else :
                b = refSeq[n][cSite[1]-1]
                if cSite in matrix and len(matrix[cSite][0]) :
                    # '.' stands for the reference base
                    matrix[cSite][0] = [ (b if s == '.' else s) for s in matrix[cSite][0]]
                else :
                    coreBases[b] = coreBases.get(b, 0) + 1
                    
    outputs = {}
    if matrixOut :
        outputs['matrix'] = prefix + '.matrix.gz'
        with uopen(prefix + '.matrix.gz', 'w') as fout :
            fout.write('## Constant_bases: {A} {C} {G} {T}\n'.format(**coreBases))
            for n in refSeq :
                fout.write('## Sequence_length: {0} {1}\n'.format(n, len(refSeq[n])))
            for region in missings :
                fout.write('## Missing_region: {0} {1} {2}\n'.format(*region))
            fout.write('\t'.join(['#Seq', '#Site'] + [ mTag for mTag, mFile in alignments ]) + '\n')
            for site in sorted(matrix) :
                bases = matrix[site]
                if len(bases[0]) :
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[0])))
                if len(bases[1]) :
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[1])))
    if alignmentOut :
        outputs['alignment'] = prefix + '.fasta.gz'
        sequences = []
        for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res) :
            j = alnId[mTag]
            # The first sample starts as a full copy of the reference, all
            # others start as gaps and are filled from their presences.
            seq = { n:['-']*len(s) for n, s in refSeq.items() } if j > 0 else { n:list(s) for n, s in refSeq.items() }
            if j :
                for n, s, e in presences :
                    seq[n][s-1:e] = refSeq[n][s-1:e]
                for n, s, e, c in absences :
                    seq[n][s-1:e] = '-' * (e-s+1)
            for site in matrix :
                bases = matrix[site]
                if len(bases[0]) :
                    seq[site[0]][site[1]-1] = bases[0][j]
            sequences.append(seq)
        with uopen(prefix + '.fasta.gz', 'w') as fout :
            for id, n in enumerate(sorted(refSeq)) :
                # '=' separates the blocks of different reference sequences
                if id :
                    fout.write('=\n')
                for (mTag, mFile), seq in zip(alignments, sequences) :
                    fout.write('>{0}:{1}\n{2}\n'.format(mTag, n, ''.join(seq[n])))
    return outputs
Exemplo n.º 8
0
def alignAgainst(data) :
    """Align one query against the reference and write the result as GFF.

    The aligner is run on ``db`` and the query, weaker overlapping
    alignments are dropped, overlapping alignments are recorded as repeats,
    the alignments are walked along their CIGAR string to collect
    mutations, and low-quality, indel-adjacent and repetitive reference
    positions are masked.  Everything is written to ``<prefix>.gff.gz``.

    Args:
        data: ``[prefix, minimap2, db, (rtag, reference), (tag, query)]``.

    Returns:
        ``[tag, prefix + '.gff.gz']``, or ``[tag, query]`` when the query
        cannot be read by ``readFastq``.
    """
    prefix, minimap2, db, (rtag, reference), (tag, query) = data
    try :
        qrySeq, qryQual = readFastq(query)
    except Exception :
        # Unreadable query: hand the input back untouched.  Exception (not a
        # bare except) so that KeyboardInterrupt/SystemExit still propagate.
        return [tag, query]
    refSeq, refQual = readFastq(reference)
    proc = subprocess.Popen('{0} -c -t1 --frag=yes -A2 -B8 -O20,40 -E3,2 -r20 -g200 -p.000001 -N5000 -f1000,5000 -n2 -m30 -s30 -z200 -2K10m --heap-sort=yes --secondary=yes {1} {2}'.format(
                                minimap2, db, query).split(), stdout=subprocess.PIPE, universal_newlines=True)
    alignments = []
    for lineId, line in enumerate(proc.stdout) :
        part = line.strip().split('\t')
        part[1:4] = [int(p) for p in part[1:4]]
        part[6:11] = [int(p) for p in part[6:11]]
        # field 11: numeric value of the tag found in column 13,
        # field 12: line number (used as alignment id),
        # field 13: that value divided by column 10,
        # fields 14-16: mutations, reference repeats, query repeats.
        part[11] = float(part[13][5:])
        part[12], part[13] = lineId, part[11]/part[10]
        part[14:17] = [[], [], []]
        alignments.append(part)
    proc.wait()
    
    # deleteChain[a] = ids of alignments that cover >= 90% of a with a
    # clearly better value in field 13.
    deleteChain = {}
    nItem = len(alignments)
    
    # overlaps on the query side
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] : break

            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if s > e+10 :
                break
            if (e-s) >= 0.9 * (p1[3]-p1[2]) and p2[13] - 0.1 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[3]-p2[2]) and p1[13] - 0.1 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    # overlaps on the reference side
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] : break

            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if s > e+10 :
                break
            
            if (e-s) >= 0.9 * (p1[8]-p1[7]) and p2[13] - 0.05 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[8]-p2[7]) and p1[13] - 0.05 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])

    # Walk from the highest field-11 value down; an alignment is deleted only
    # if one of the alignments that beat it has itself survived.
    deleted = {}
    for p in sorted(alignments, key=lambda x:x[11], reverse=True) :
        id = p[12]
        if id in deleteChain :
            for jd in deleteChain[id] :
                if jd not in deleted :
                    deleted[id] = 1
                    break
    alignments = [p for p in alignments if p[12] not in deleted]
    
    # repeats in qry
    nItem = len(alignments)
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] : break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if e > s :
                p1[16].append([s, e])
                p2[16].append([s, e])
            else :
                break
    # repeats in ref
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] : break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if e > s :
                p1[15].append([s, e])
                p2[15].append([s, e])
            else :
                break
    
    # maskedRegion[(contig, site)]: 0 = low quality or next to an indel,
    # 1 = repetitive.
    maskedRegion = {}
    refRepeat = []
    for p in alignments :
        # prepare a unique set of repeat region
        qryRepeat = []
        if len(p[16]) > 0 :
            qryRepeat.append(p[16][0])
            for pp in p[16][1:] :
                if pp[0] > qryRepeat[-1][1]+20 :
                    qryRepeat.append(pp)
                elif pp[1] > qryRepeat[-1][1]:
                    qryRepeat[-1][1] = pp[1]
        ref = [refSeq[p[5]], refQual[p[5]]]
        qry = [qrySeq[p[0]], qryQual[p[0]]]
        cigar = p[-1][5:]
        d = 1 if p[4] == '+' else -1
        # Each query repeat becomes [start, end, refStart, refEnd]; the last
        # two are filled in while walking the CIGAR string.
        if d < 0 :
            qryRepeat = [[q[1], q[0], -1, -1] for q in qryRepeat]
        else :
            qryRepeat = [[q[0], q[1], -1, -1] for q in reversed(qryRepeat)]

        mut = []
        # current position as [reference site, query site]
        alnSite = [p[7], p[2] if d > 0 else p[3]-1]
        for cl, ct in re.findall(r'(\d+)([MID])', cigar) :
            cl = int(cl)
            if ct == 'M' :
                # extract aligned sequences
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)])
                # Qualities come from qry[1] on both strands; the reverse
                # strand previously reversed the sequence (qry[0]) instead.
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)]))

                e =[alnSite[0]+cl, alnSite[1]+cl*d]
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e[1] :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0] + d*(qr[0] - alnSite[1])
                        if d*qr[1] <= d*e[1] :
                            qr[3] = alnSite[0] + d*(qr[1] - alnSite[1])
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                for id, (rr, rr1, qq, qq1) in enumerate(np.array([list(r), list(r1), list(q), list(q1)]).T) :
                    # quality characters below ord 43 mask the site
                    if ord(rr1) < 43 or ord(qq1) < 43 :
                        maskedRegion[(p[5], alnSite[0]+id)] = 0
                    if rr != qq and rr != 'N' and qq != 'N' :
                        mut.append([alnSite[0]+id, alnSite[1]+id*d, rr, qq, p[4]])
                alnSite = e
            elif ct == 'I' :
                # Same strand handling as for 'M' above; the strand test for
                # the inserted bases used to be inverted (d < 0).
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)] )
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)] ))
                
                e = alnSite[1] + cl*d
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0]
                        if d*qr[1] <= d*e :
                            qr[3] = alnSite[0]
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                
                if ord(min(list(q1))) >= 43 :
                    mut.append([alnSite[0], min(alnSite[1], e), '.', '+' + q, p[4]])
                # the two reference sites at the insertion are masked
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[1] = e
            elif ct == 'D' :
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                if ord(min(list(r1))) >= 43 :
                    mut.append([alnSite[0], int(alnSite[1]+0.5*d), '.', '-' + r, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[0]+=cl
        p[14] = mut
        refRepeat.extend([ [p[5], pp[0], pp[1]] for pp in p[15] ])

    # merge reference repeats that are at most 20 sites apart
    repeats = []
    if len(refRepeat) :
        refRepeat.sort()
        repeats = [refRepeat[0]]
        for p in refRepeat[1:] :
            if p[0] != repeats[-1][0] or p[1] - 20 > repeats[-1][2] :
                repeats.append(p)
            elif p[2] > repeats[-1][2] :
                repeats[-1][2] = p[2]

    for p in repeats :
        for site in xrange(p[1], p[2]) :
            maskedRegion[(p[0], site)] = 1

    # collapse all masked sites into contiguous [contig, first, last] regions
    repeats = []
    for cont, site in sorted(maskedRegion) :
        if len(repeats) == 0 or repeats[-1][0] != cont or repeats[-1][2]+1 < site :
            repeats.append([cont, site, site])
        else :
            repeats[-1][2] = site
  
    mutations = []
    alignments = [aln for aln in alignments if aln[9] >= 100]
    for aln in alignments :
        for m in aln[14] :
            if len(m[3]) == 1 :
                # substitutions: dropped on any masked site
                if (aln[5], m[0]) not in maskedRegion :
                    mutations.append([aln[5], aln[0]] + m)
            elif maskedRegion.get((aln[5], m[0]), 0) != 1 :
                # indels: dropped only when they touch a repetitive site
                if m[3].startswith('-') and maskedRegion.get((aln[5], m[0]+len(m[3])-2), 0) > 0 :
                    continue
                mutations.append([aln[5], aln[0]] + m)
    with uopen(prefix + '.gff.gz', 'w') as fout :
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        for aln in alignments :
            if aln[5] == aln[0] and aln[2] == aln[7] and aln[3] == aln[8] :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Self%20Alignments"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3], 
                ))
            else :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Aligned%20with%20{5}:{6}-{7}"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3], 
                ))
                
        for p in repeats :
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="Uncertain%20base%20calling%20or%20ambigious%20alignment"\n'.format(
                p[0], p[1]+1, p[2]+1, 
            ))
        for mut in mutations :
            e1 = mut[2] if not mut[5].startswith('-') else mut[2] + len(mut[5]) - 2
            e2 = mut[3] if not mut[5].startswith('+') else mut[3] + len(mut[5]) - 2
            # long indels are abbreviated to their sign plus a length
            if len(mut[5]) > 26 :
                mut[5] = '{0}[{1}bps]'.format(mut[5][0], len(mut[5])-1)

            fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t/replace="{7}";/compare="{3}:{4}-{5}:{8}";/origin="{6}"\n'.format(
                mut[0], mut[2]+1, e1+1, mut[1], mut[3]+1, e2+1, mut[4], mut[5], mut[6]
            ))

    return [tag, prefix + '.gff.gz']
Exemplo n.º 9
0
    def runMMseq(self, ref, qry):
        """Search the translated queries against the reference with MMseqs.

        Databases are created for ``ref`` and ``qry``, the query database is
        translated, and ``mmseqs search`` is attempted up to nine times
        with a one-second pause between attempts.  From the fourth attempt
        on, a failed search is retried immediately against ORFs extracted
        from the reference.  The hits are exported with ``convertalis``
        and converted by ``parseMMSeq``.

        Args:
            ref: reference sequence file.
            qry: query sequence file.

        Returns:
            A pandas DataFrame of the alignments that pass ``self.min_id``,
            ``self.min_cov`` and ``self.min_ratio``.
        """
        logger('Run MMSeqs starts')

        def parseMMSeq(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Turn the convertalis table ``fin`` into the internal layout."""
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab = blastab[blastab[2] >= min_id]
            qlen = blastab[0].apply(lambda r: len(qryseq[r]))
            rlen = blastab[1].apply(lambda r: len(refseq[r]))
            # CIGAR lengths and the column 6/7 coordinates are scaled by 3
            # (presumably amino acids to nucleotides - confirm).
            cigar = blastab[14].apply(
                lambda x: [[int(n) * 3, t]
                           for n, t in re.findall(r'(\d+)([A-Z])', x)])
            ref_sites = pd.concat([3 * (blastab[6] - 1) + 1, 3 * blastab[7]],
                                  keys=[0, 1],
                                  axis=1)
            # d: how far the scaled end runs past the sequence length
            d = ref_sites[1] - qlen
            d[d < 0] = 0

            def ending(x, y):
                # shorten the last CIGAR operation in place
                x[-1][0] -= y

            np.vectorize(ending)(cigar, d)
            ref_sites[1] -= d

            # trim the matching end of columns 8/9, depending on orientation
            direction = (blastab[8] < blastab[9])
            qry_sites = pd.concat([blastab[8], blastab[9] - d], axis=1)
            qry_sites[~direction] = pd.concat([blastab[8] - d, blastab[9]],
                                              axis=1)[~direction]

            # .values[:, np.newaxis] instead of indexing the Series with
            # [:, np.newaxis]: the latter is deprecated multi-dimensional
            # indexing that newer pandas versions reject.
            blastab = pd.DataFrame(
                np.hstack([
                    blastab[[0, 1, 2]],
                    np.apply_along_axis(lambda x: x[1] - x[0] + 1, 1,
                                        ref_sites.values)[:, np.newaxis],
                    pd.DataFrame(np.zeros([blastab.shape[0], 2], dtype=int)),
                    ref_sites, qry_sites, blastab[[10, 11]],
                    qlen.values[:, np.newaxis],
                    rlen.values[:, np.newaxis],
                    cigar.values[:, np.newaxis]
                ]))
            return blastab[(blastab[3] >= min_cov)
                           & (blastab[3] >= blastab[12] * min_ratio)]

        tmpDir = os.path.join(self.dirPath, 'tmp')
        refNA = os.path.join(self.dirPath, 'refNA')
        qryNA = os.path.join(self.dirPath, 'qryNA')

        refCDS = os.path.join(self.dirPath, 'refCDS')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch2')

        Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
            mmseqs, ref, refNA).split(),
              stdout=PIPE).communicate()
        Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
            mmseqs, qry, qryNA).split(),
              stdout=PIPE).communicate()
        Popen('{0} translatenucs {1} {2}'.format(mmseqs, qryNA, qryAA).split(),
              stdout=PIPE).communicate()
        for ite in range(9):
            # every attempt starts from a clean temporary directory
            if os.path.isdir(tmpDir):
                shutil.rmtree(tmpDir)
            p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(
                mmseqs, qryAA, refNA, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
            p.communicate()
            if p.returncode == 0:
                break
            if ite > 2:
                # fallback: search against ORFs extracted from the reference
                Popen('{0} extractorfs {2} {3}'.format(mmseqs, qryAA, refNA,
                                                       refCDS).split(),
                      stdout=PIPE).communicate()
                p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(
                    mmseqs, qryAA, refCDS, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
                p.communicate()
                if p.returncode == 0:
                    break
            time.sleep(1)
        Popen('{0} convertalis {1} {2} {3} {3}.tab --threads {4} --format-output'.format(
            mmseqs, qryAA, refNA, aaMatch, self.n_thread).split() + ['query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,raw,qlen,tlen,cigar'], stdout=PIPE).communicate()

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)
        # context manager so the table file is closed after parsing
        with open(aaMatch + '.tab') as fin:
            blastab = parseMMSeq(fin, self.refSeq, self.qrySeq,
                                 self.min_id, self.min_cov, self.min_ratio)
        logger('Run MMSeqs finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Exemplo n.º 10
0
    def runUBlast(self, ref, qry, nhits=6, frames='7'):
        """Align translated queries against translated references with usearch -ublast.

        The query and reference sequences are translated with ``transeq``,
        written to ``qryAA`` / ``refAA`` inside ``self.dirPath`` and searched
        in several reference batches. Hit coordinates reported on the
        translated sequences are converted back (x3) to coordinates on the
        original sequences.

        Parameters:
            ref: reference file, read with ``readFastq`` only when
                ``self.refSeq`` is empty.
            qry: query file, read with ``readFastq`` only when
                ``self.qrySeq`` is empty.
            nhits: value passed to usearch ``-maxhits``.
            frames: frame selector forwarded to ``transeq`` for the references.

        Returns:
            A pandas DataFrame with columns 0..14 (query, target, identity,
            alignment length, mismatches, gap opens, query start/end,
            target start/end, e-value, score, query length, target length,
            CIGAR). The frame is empty when no batch produced a hit.
        """
        logger('Run uBLAST starts')

        def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Parse one usearch ``-userout`` table into sequence coordinates.

            Sequence names in the table carry a ``:<frame>`` suffix that is
            split off again here. Rows below ``min_id`` identity, or whose
            query span is shorter than ``min_cov`` or ``min_ratio`` of the
            query length, are dropped.
            """
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab[2] /= 100.
            # Work on an independent copy so the column assignments below do
            # not target a slice of the frame that was read from disk.
            blastab = blastab[blastab[2] >= min_id].copy()
            if blastab.shape[0] == 0:
                # Nothing passed the identity filter; the name splitting below
                # cannot run on an empty frame.
                return blastab.drop(columns=[15, 16])
            blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3

            # `n` must be given by keyword: it is keyword-only in current pandas.
            qf = blastab[0].str.rsplit(':', n=1, expand=True)
            rf = blastab[1].str.rsplit(':', n=1, expand=True)
            if np.all(qf[0].str.isdigit()):
                qf[0] = qf[0].astype(int)
            if np.all(rf[0].str.isdigit()):
                rf[0] = rf[0].astype(int)
            blastab[0], qf = qf[0], qf[1].astype(int)
            blastab[1], rf = rf[0], rf[1].astype(int)

            # Query coordinates: translated positions -> original positions,
            # shifted by the 1-based frame number.
            blastab[6], blastab[7] = (blastab[6] * 3 + qf - 3,
                                      blastab[7] * 3 + qf - 1)
            blastab[14] = [
                [[3 * vv[0], vv[1]] for vv in v]
                for v in map(getCIGAR, zip(blastab[15], blastab[14]))
            ]

            blastab[12] = blastab[0].apply(lambda x: len(qryseq[str(x)]))
            blastab[13] = blastab[1].apply(lambda x: len(refseq[str(x)]))

            # Frames 1-3 are converted directly; higher frames are mirrored
            # against the reference length (presumably reverse-strand frames
            # -- confirm against transeq).
            rf3 = (rf <= 3)
            rev = ~rf3
            fwd_start = blastab.loc[rf3, 8] * 3 + rf[rf3] - 3
            fwd_end = blastab.loc[rf3, 9] * 3 + rf[rf3] - 1
            blastab.loc[rf3, 8], blastab.loc[rf3, 9] = fwd_start, fwd_end
            rev_start = blastab.loc[rev, 13] - (
                blastab.loc[rev, 8] * 3 + rf[rev] - 3 - 3) + 1
            rev_end = blastab.loc[rev, 13] - (
                blastab.loc[rev, 9] * 3 + rf[rev] - 3 - 1) + 1
            blastab.loc[rev, 8], blastab.loc[rev, 9] = rev_start, rev_end

            # Number of positions by which each alignment overshoots the
            # query end, the reference end or the reference start.
            d = np.max([
                blastab[7] - blastab[12],
                blastab[9] - blastab[13],
                1 - blastab[9],
                np.zeros(blastab.shape[0], dtype=int),
            ], axis=0)
            blastab[7] -= d

            # Shorten the last CIGAR operation by the same amount. A plain
            # loop is required: np.vectorize without `otypes` calls the
            # function one extra time on the first element, which would trim
            # the first CIGAR twice.
            for cigar, trim in zip(blastab[14], d.tolist()):
                cigar[-1][0] -= trim

            d[rev.values] *= -1
            blastab[9] -= d
            blastab = blastab[
                (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12])
                & (blastab[7] - blastab[6] + 1 >= min_cov)]
            return blastab.drop(columns=[15, 16])

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        # For every query keep the single translation with the fewest
        # 'X'-separated pieces (the final character is ignored).
        qryAASeq = transeq(self.qrySeq, frame='F')
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, frame_idx, s = min([(len(aa[:-1].split('X')), idx, aa)
                                       for idx, aa in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_idx + 1, s))

        refAASeq = transeq(self.refSeq, frames)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_idx, s in enumerate(ss):
                toWrite.append('>{0}:{1}\n{2}\n'.format(n, frame_idx + 1, s))

        # The batch count and the slicing stride must be the same number,
        # otherwise some references are searched twice (duplicated hits).
        n_batch = 5
        blastab = []
        for batch in range(n_batch):
            with open(refAA, 'w') as fout:
                fout.writelines(toWrite[batch::n_batch])

            ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
                usearch=usearch,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio)
            Popen(ublast_cmd.split(),
                  stderr=PIPE,
                  stdout=PIPE,
                  universal_newlines=True).communicate()
            if os.path.isfile(aaMatch) and os.path.getsize(aaMatch) > 0:
                with open(aaMatch) as fin:
                    blastab.append(
                        parseUBlast(fin, self.refSeq, self.qrySeq,
                                    self.min_id, self.min_cov, self.min_ratio))
                # Remove the parsed table so a failed later batch cannot be
                # mistaken for a fresh result.
                os.unlink(aaMatch)

        if blastab:
            blastab = pd.concat(blastab)
        else:
            # pd.concat raises on an empty list; return an empty 15-column
            # table instead.
            blastab = pd.DataFrame(columns=list(range(15)))
        logger('Run uBLAST finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Exemplo n.º 11
0
    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Align translated queries against translated references with diamond blastp.

        The query and reference sequences are translated with ``transeq``
        (using ``self.table_id``), written to ``qryAA`` / ``refAA`` inside
        ``self.dirPath`` and searched in five reference batches. The SAM-like
        output (``--outfmt 101``) is parsed and the coordinates are converted
        back (x3) to positions on the original sequences.

        Parameters:
            ref: reference file, read with ``readFastq`` only when
                ``self.refSeq`` is empty.
            qry: query file, read with ``readFastq`` only when
                ``self.qrySeq`` is empty.
            nhits: value passed to diamond ``-k``.
            frames: frame selector forwarded to ``transeq`` for the references.

        Returns:
            A pandas DataFrame with columns 0..14 (query, target, identity,
            alignment length, mismatches, gap opens, query start/end,
            target start/end, e-value, score, query length, target length,
            CIGAR). The frame is empty when no batch produced a hit.
        """
        logger('Run diamond starts')

        def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Parse diamond output lines into a hit table.

            Returns a DataFrame, or None when no alignment passes the
            ``min_cov`` / ``min_ratio`` / ``min_id`` filters.
            """
            blastab = []
            for line in fin:
                if line.startswith('@'):
                    continue
                part = line.strip().split('\t')
                if part[2] == '*':
                    continue
                # Names are '<query>:<frame>' and '<ref>:<frame>:<offset>',
                # as written by the enclosing method.
                qn, qf = part[0].rsplit(':', 1)
                rn, rf, rx = part[2].rsplit(':', 2)
                # Add the chunk offset back to the reported position.
                rs = int(part[3]) + int(rx)
                ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
                qm = len(part[9])
                if qm * 3 < min_cov:
                    continue
                cov_ratio = qm * 3. / ql
                if cov_ratio < min_ratio:
                    continue
                cigar = [[int(n) * 3, t]
                         for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
                cl = np.sum([c[0] for c in cigar])
                # Tags are read from their usual column when present there,
                # otherwise searched for anywhere in the line.
                if part[12].startswith('NM:'):
                    variation = float(part[12][5:]) * 3
                else:
                    variation = float(re.findall(r'NM:i:(\d+)', line)[0]) * 3

                iden = 1 - round(variation / cl, 3)
                if iden < min_id:
                    continue
                qf, rf = int(qf), int(rf)
                if part[18].startswith('ZS:'):
                    qs = int(part[18][5:])
                else:
                    qs = int(re.findall(r'ZS:i:(\d+)', line)[0])

                rm = int(
                    np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
                # Frames 1-3 are converted directly; higher frames are
                # mirrored against the sequence length (presumably
                # reverse-strand frames -- confirm against transeq).
                if rf <= 3:
                    rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
                else:
                    rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                        (rs + rm - 1) * 3 + rf - 4) + 1
                if qf <= 3:
                    qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
                else:
                    qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                        (qs + qm - 1) * 3 + qf - 4) + 1
                    qs, qe, rs, r_e = qe, qs, r_e, rs
                    cigar = list(reversed(cigar))

                cd = [c[0] for c in cigar if c[1] != 'M']
                if part[14].startswith('ZR:'):
                    score = int(part[14][5:])
                else:
                    score = int(re.findall(r'ZR:i:(\d+)', line)[0])
                blastab.append([
                    qn, rn, iden, cl,
                    int(variation - sum(cd)),
                    len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
                ])
            if not blastab:
                # A DataFrame built from no rows has no columns, so the
                # column access below would raise; the caller skips None.
                return None
            blastab = pd.DataFrame(blastab)
            blastab[[0, 1]] = blastab[[0, 1]].astype(str)
            return blastab

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        # For every query keep the single translation with the fewest
        # 'X'-separated pieces (the final character is ignored).
        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, frame_idx, s = min([(len(aa[:-1].split('X')), idx, aa)
                                       for idx, aa in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_idx + 1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(diamond_fmt.split(),
              stderr=PIPE,
              stdout=PIPE,
              universal_newlines=True).communicate()

        # Cut every reference translation into pieces of at least 1000
        # residues ending on an 'X' and record each piece's offset in its
        # FASTA name so parseDiamond can restore the coordinates.
        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_idx, s in enumerate(ss):
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]
                cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss):
                    if len(cs):
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                            n, frame_idx + 1, ci, cs))

        n_batch = 5
        blastab = []
        for batch in range(n_batch):
            with open(refAA, 'w') as fout:
                fout.writelines(toWrite[batch::n_batch])
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            Popen(diamond_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()
            # Reset per batch: otherwise an empty output would either hit an
            # unbound name (first batch) or re-append the previous batch.
            tab = None
            if os.path.isfile(aaMatch) and os.path.getsize(aaMatch) > 0:
                with open(aaMatch) as fin:
                    tab = parseDiamond(fin, self.refSeq, self.qrySeq,
                                       self.min_id, self.min_cov,
                                       self.min_ratio)
                os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)

        if blastab:
            blastab = pd.concat(blastab)
        else:
            # pd.concat raises on an empty list; return an empty 15-column
            # table instead.
            blastab = pd.DataFrame(columns=list(range(15)))
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab