예제 #1
0
def iter_readGFF(fname):
    """Read a GFF file with an embedded FASTA section.

    Lines starting with '#' are ignored.  Everything from the first '>' line
    onwards is treated as FASTA; everything before it is treated as
    tab-separated feature records.

    Args:
        fname: path handed to ``uopen``; it is also stored in every record.

    Returns:
        A ``(seq, cds)`` tuple of dicts:
          * ``seq[name] = [fname, SEQUENCE]`` with the sequence upper-cased.
          * ``cds[name] = [fname, seqName, start, end, direction, hash,
            sequence]``.  ``sequence`` is '' and ``hash`` is 0 when the CDS
            could not be extracted or is rejected by ``checkCDS``.

    Raises:
        AssertionError: on a duplicated FASTA sequence name or a CDS record
            for which no name could be derived.
    """
    seq, cds = {}, {}
    # feature ID -> resolved name list, used to resolve ``Parent=`` references
    names = {}
    with uopen(fname) as fin:
        sequenceMode = False
        for line in fin:
            if line.startswith('#'):
                continue
            elif line.startswith('>'):
                sequenceMode = True
                name = line[1:].strip().split()[0]
                # Raised explicitly (not via ``assert``) so the check is kept
                # when Python runs with -O; the exception type is unchanged.
                if name in seq:
                    raise AssertionError(logger(
                        'Error: duplicated sequence name {0}'.format(name)))
                seq[name] = [fname, []]
            elif sequenceMode:
                seq[name][1].extend(line.strip().split())
            else:
                part = line.strip().split('\t')
                # The attribute column (index 8) is read unconditionally
                # below, so only records with all nine columns are usable.
                if len(part) > 8:
                    # Name priority: locus_tag, then the name registered for
                    # the Parent feature, then Name, then ID.
                    name = re.findall(r'locus_tag=([^;]+)', part[8])
                    if len(name) == 0:
                        parent = re.findall(r'Parent=([^;]+)', part[8])
                        if len(parent) and parent[0] in names:
                            name = names[parent[0]]
                    if len(name) == 0:
                        name = re.findall(r'Name=([^;]+)', part[8])
                    if len(name) == 0:
                        name = re.findall(r'ID=([^;]+)', part[8])

                    if part[2] == 'CDS':
                        if len(name) == 0:
                            raise AssertionError(logger(
                                'Error: CDS has no name. {0}'.format(line)))
                        #          source_file, seqName, Start,       End,      Direction, hash, Sequences
                        cds[name[0]] = [
                            fname, part[0],
                            int(part[3]),
                            int(part[4]), part[6], 0, ''
                        ]
                    else:
                        # remember the name of non-CDS features so that child
                        # CDS records can inherit it through Parent=
                        ids = re.findall(r'ID=([^;]+)', part[8])
                        if len(ids):
                            names[ids[0]] = name

    for n in seq:
        seq[n][1] = ''.join(seq[n][1]).upper()
    for n, c in cds.items():
        try:
            # GFF coordinates are 1-based inclusive
            c[6] = seq[c[1]][1][(c[2] - 1):c[3]]
            if c[4] == '-':
                c[6] = rc(c[6])
            if not checkCDS(n, c[6]):
                c[6] = ''
            else:
                c[5] = int(hashlib.sha1(c[6].encode('utf-8')).hexdigest(), 16)
        except Exception:
            # best-effort: a CDS that cannot be extracted (e.g. its contig is
            # missing from the FASTA section) is kept with an empty sequence
            c[6] = ''

    return seq, cds
예제 #2
0
파일: align.py 프로젝트: nickp60/EToKi
def alignAgainst(data) :
    """Align one query against the reference with minimap2 and write a GFF.

    Args:
        data: tuple ``(prefix, minimap2, db, (rtag, reference), (tag, query))``
            where ``minimap2`` is the executable, ``db`` the target passed to
            it, and ``reference`` / ``query`` are files readable by
            ``readFastq``.

    Returns:
        ``[tag, prefix + '.gff.gz']`` on success, or ``[tag, query]`` when the
        query cannot be read by ``readFastq``.

    Each parsed alignment row is a list whose slots are used as follows
    (slots 0-10 come straight from the minimap2 output line -- presumably the
    PAF columns, TODO confirm against the minimap2 manual):
        0 query name, 2-3 query start/end, 4 strand, 5 reference name,
        7-8 reference start/end, 9-10 integer columns used for filtering and
        score normalisation, 11 score parsed from the tag in column 13,
        12 input line index (unique id), 13 score / column 10,
        14 mutations, 15 repeat regions on the reference,
        16 repeat regions on the query, last column: CIGAR tag.
    """
    prefix, minimap2, db, (rtag, reference), (tag, query) = data
    try :
        qrySeq, qryQual = readFastq(query)
    except Exception :
        # best-effort: an unreadable query is handed back untouched
        return [tag, query]
    refSeq, refQual = readFastq(reference)
    proc = subprocess.Popen('{0} -c -t1 --frag=yes -A2 -B8 -O20,40 -E3,2 -r20 -g200 -p.000001 -N5000 -f1000,5000 -n2 -m30 -s30 -z200 -2K10m --heap-sort=yes --secondary=yes {1} {2}'.format(
                                minimap2, db, query).split(), stdout=subprocess.PIPE, universal_newlines=True)
    alignments = []
    for lineId, line in enumerate(proc.stdout) :
        part = line.strip().split('\t')
        part[1:4] = [int(p) for p in part[1:4]]
        part[6:11] = [int(p) for p in part[6:11]]
        # strip the 5-character tag prefix (e.g. "XX:i:") to get the score
        part[11] = float(part[13][5:])
        part[12], part[13] = lineId, part[11]/part[10]
        part[14:17] = [[], [], []]
        alignments.append(part)
    proc.wait()
    
    # deleteChain[a] = ids of better alignments that cover >= 90% of a
    deleteChain = {}
    nItem = len(alignments)
    
    # overlaps on the query
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] : break

            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if s > e+10 :
                break
            if (e-s) >= 0.9 * (p1[3]-p1[2]) and p2[13] - 0.1 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[3]-p2[2]) and p1[13] - 0.1 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    # overlaps on the reference
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] : break

            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if s > e+10 :
                break
            
            if (e-s) >= 0.9 * (p1[8]-p1[7]) and p2[13] - 0.05 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[8]-p2[7]) and p1[13] - 0.05 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])

    # Walk from the highest score down: an alignment is dropped only if at
    # least one of the alignments that beat it has itself survived.
    deleted = {}
    for p in sorted(alignments, key=lambda x:x[11], reverse=True) :
        alnId = p[12]
        if alnId in deleteChain :
            for jd in deleteChain[alnId] :
                if jd not in deleted :
                    deleted[alnId] = 1
                    break
    alignments = [p for p in alignments if p[12] not in deleted]
    
    # repeats in qry
    nItem = len(alignments)
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] : break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if e > s :
                p1[16].append([s, e])
                p2[16].append([s, e])
            else :
                break
    # repeats in ref
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] : break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if e > s :
                p1[15].append([s, e])
                p2[15].append([s, e])
            else :
                break
    
    # maskedRegion[(contig, site)]: 0 = low quality / next to an indel,
    # 1 = inside a repeat (assigned further below)
    maskedRegion = {}
    refRepeat = []
    for p in alignments :
        # prepare a unique set of repeat region
        qryRepeat = []
        if len(p[16]) > 0 :
            qryRepeat.append(p[16][0])
            for pp in p[16][1:] :
                if pp[0] > qryRepeat[-1][1]+20 :
                    qryRepeat.append(pp)
                elif pp[1] > qryRepeat[-1][1]:
                    qryRepeat[-1][1] = pp[1]
        ref = [refSeq[p[5]], refQual[p[5]]]
        qry = [qrySeq[p[0]], qryQual[p[0]]]
        cigar = p[-1][5:]
        d = 1 if p[4] == '+' else -1
        # Order the repeats so that the one met first while walking the
        # alignment is at the END of the list: the loops below scan backwards
        # and delete entries once they are fully projected on the reference.
        if d < 0 :
            qryRepeat = [[q[1], q[0], -1, -1] for q in qryRepeat]
        else :
            qryRepeat = [[q[0], q[1], -1, -1] for q in reversed(qryRepeat)]

        mut = []
        # alnSite = [current reference position, current query position]
        alnSite = [p[7], p[2] if d > 0 else p[3]-1]
        for cl, ct in re.findall(r'(\d+)([MID])', cigar) :
            cl = int(cl)
            if ct == 'M' :
                # extract aligned sequences
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)])
                # qualities must come from the quality string (qry[1]) on
                # both strands; on the reverse strand they are only reversed
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)]))

                e =[alnSite[0]+cl, alnSite[1]+cl*d]
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e[1] :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0] + d*(qr[0] - alnSite[1])
                        if d*qr[1] <= d*e[1] :
                            qr[3] = alnSite[0] + d*(qr[1] - alnSite[1])
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                for offset, (rr, rr1, qq, qq1) in enumerate(np.array([list(r), list(r1), list(q), list(q1)]).T) :
                    # quality character below '+' (ASCII 43) -> mask the site
                    if ord(rr1) < 43 or ord(qq1) < 43 :
                        maskedRegion[(p[5], alnSite[0]+offset)] = 0
                    if rr != qq and rr != 'N' and qq != 'N' :
                        mut.append([alnSite[0]+offset, alnSite[1]+offset*d, rr, qq, p[4]])
                alnSite = e
            elif ct == 'I' :
                # same strand handling as in the 'M' branch: forward slice on
                # '+', reverse complement (bases) / reversed (qualities) on '-'
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)] )
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)] ))
                
                e = alnSite[1] + cl*d
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0]
                        if d*qr[1] <= d*e :
                            qr[3] = alnSite[0]
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                
                # report the insertion only if all inserted bases pass quality
                if ord(min(list(q1))) >= 43 :
                    mut.append([alnSite[0], min(alnSite[1], e), '.', '+' + q, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[1] = e
            elif ct == 'D' :
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                if ord(min(list(r1))) >= 43 :
                    mut.append([alnSite[0], int(alnSite[1]+0.5*d), '.', '-' + r, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[0]+=cl
        p[14] = mut
        refRepeat.extend([ [p[5], pp[0], pp[1]] for pp in p[15] ])

    # merge repeat regions on the reference that are at most 20 bp apart
    repeats = []
    if len(refRepeat) :
        refRepeat.sort()
        repeats = [refRepeat[0]]
        for p in refRepeat[1:] :
            if p[0] != repeats[-1][0] or p[1] - 20 > repeats[-1][2] :
                repeats.append(p)
            elif p[2] > repeats[-1][2] :
                repeats[-1][2] = p[2]

    for p in repeats :
        for site in xrange(p[1], p[2]) :
            maskedRegion[(p[0], site)] = 1

    # collapse all masked sites into contiguous [contig, first, last] runs
    repeats = []
    for cont, site in sorted(maskedRegion) :
        if len(repeats) == 0 or repeats[-1][0] != cont or repeats[-1][2]+1 < site :
            repeats.append([cont, site, site])
        else :
            repeats[-1][2] = site
  
    mutations = []
    alignments = [aln for aln in alignments if aln[9] >= 100]
    for aln in alignments :
        for m in aln[14] :
            if len(m[3]) == 1 :
                # substitution: dropped when the site is masked for any reason
                if (aln[5], m[0]) not in maskedRegion :
                    mutations.append([aln[5], aln[0]] + m)
            elif maskedRegion.get((aln[5], m[0]), 0) != 1 :
                # indel: dropped only when it touches a repeat
                if m[3].startswith('-') and maskedRegion.get((aln[5], m[0]+len(m[3])-2), 0) > 0 :
                    continue
                mutations.append([aln[5], aln[0]] + m)
    with uopen(prefix + '.gff.gz', 'w') as fout :
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        for aln in alignments :
            if aln[5] == aln[0] and aln[2] == aln[7] and aln[3] == aln[8] :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Self%20Alignments"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3], 
                ))
            else :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Aligned%20with%20{5}:{6}-{7}"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3], 
                ))
                
        for p in repeats :
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="Uncertain%20base%20calling%20or%20ambigious%20alignment"\n'.format(
                p[0], p[1]+1, p[2]+1, 
            ))
        # mut = [ref contig, query contig, ref pos, query pos, ref base,
        #        query base or '+ins' / '-del', strand]
        for mut in mutations :
            e1 = mut[2] if not mut[5].startswith('-') else mut[2] + len(mut[5]) - 2
            e2 = mut[3] if not mut[5].startswith('+') else mut[3] + len(mut[5]) - 2
            if len(mut[5]) > 26 :
                # abbreviate long indels as "<sign>[<n>bps]"
                mut[5] = '{0}[{1}bps]'.format(mut[5][0], len(mut[5])-1)

            fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t/replace="{7}";/compare="{3}:{4}-{5}:{8}";/origin="{6}"\n'.format(
                mut[0], mut[2]+1, e1+1, mut[1], mut[3]+1, e2+1, mut[4], mut[5], mut[6]
            ))

    return [tag, prefix + '.gff.gz']
예제 #3
0
def write_output(prefix, prediction, genomes, clust_ref, old_prediction):
    """Write allele sequences and a GFF of predicted orthologous genes.

    Two files are produced: ``<prefix>.allele.fna`` (FASTA of the alleles seen
    for every gene) and ``<prefix>.EToKi.gff`` (one CDS / pseudogene record
    per prediction).

    Args:
        prefix: output path prefix, also used to build feature IDs.
        prediction: tab-separated table without header, read with pandas.
            Columns used here (0-based): 0 gene, 1 flag (-1 marks an
            uncertain hit), 2 and 3 group identifiers, 4 genome key,
            5 contig, 7-8 matched range on the gene, 9-10 matched range on
            the contig, 11 strand (rewritten here), 12 gene length,
            13 contig length.  (Meanings inferred from how the columns are
            used -- TODO confirm against the producer of the table.)
        genomes: mapping; ``genomes[contig][1]`` is the contig sequence.
        clust_ref: mapping from gene name to its reference sequence, which
            becomes allele 1.
        old_prediction: mapping from contig to a coordinate-ordered list of
            previous annotations ``[name, start, end, strand]``, used to
            attach ``locus_tag`` values.

    Returns:
        None.
    """
    predictions, alleles = {}, {}

    # ``with`` guarantees the allele file is closed even if a later step fails
    with open('{0}.allele.fna'.format(prefix), 'w') as allele_file:
        prediction = pd.read_csv(prediction, sep='\t', header=None).values
        for part in prediction:
            if part[0] not in alleles:
                alleles[part[0]] = {clust_ref[part[0]]: 1}
                allele_file.write('>{0}_{1}\n{2}\n'.format(
                    part[0], 1, clust_ref[part[0]]))

            # l / r: how far the match can be extended to the left / right
            # before hitting the end of the gene or of the contig;
            # d: direction on the contig
            if part[9] < part[10]:
                l = min(part[7] - 1, part[9] - 1)
                r = min(part[12] - part[8], part[13] - part[10])
                d = 1
            else:
                l = min(part[7] - 1, part[13] - part[9])
                r = min(part[12] - part[8], part[10] - 1)
                d = -1
            if l <= 6 and part[7] - l == 1:
                # a few missing bases at the start: extend to the gene start
                part[7], part[9] = part[7] - l, part[9] - l * d
            else:
                # otherwise trim the start to the next codon boundary
                ll = (part[7] - 1) % 3
                if ll > 0:
                    part[7], part[9] = part[7] + 3 - ll, part[9] + (3 - ll) * d
            if r <= 6 and part[8] + r == part[12]:
                part[8], part[10] = part[8] + r, part[10] + r * d
            else:
                rr = (part[12] - part[8]) % 3
                if rr > 0:
                    # both coordinates must move by the same (3 - rr) bases
                    part[8], part[10] = part[8] - (3 - rr), part[10] - (3 - rr) * d

            # store contig coordinates as start <= end plus an explicit strand
            if part[9] < part[10]:
                part[9:12] = part[9], part[10], '+'
            else:
                part[9:12] = part[10], part[9], '-'

            if part[4] not in predictions:
                predictions[part[4]] = []
            elif predictions[part[4]][-1][2] == part[2]:
                prev = predictions[part[4]][-1]
                # merge with the previous hit when both lie on the same
                # contig and are less than 500 bases apart on gene and contig
                if prev[5] == part[5] and part[7] - prev[8] < 500:
                    if part[11] == '+' and part[9] - prev[10] < 500:
                        prev[8], prev[10] = part[8], part[10]
                        continue
                    elif part[11] == '-' and prev[9] - part[10] < 500:
                        prev[8], prev[9] = part[8], part[9]
                        continue
                # same group but not mergeable: flag both hits as uncertain
                predictions[part[4]][-1][1], part[1] = -1, -1
            predictions[part[4]].append(part)

        # op = [contig, index of first still-relevant old record, old records]
        op = ['', 0, []]
        with open('{0}.EToKi.gff'.format(prefix), 'w') as fout:
            for gid, (g, predict) in enumerate(predictions.items()):
                predict.sort(key=itemgetter(5, 9, 10))
                for pid, pred in enumerate(predict):
                    # matched region; set for every prediction because it is
                    # written to the GFF line in both branches below
                    s, e = pred[9:11]
                    if pred[1] == -1 or (pred[10] - pred[9] + 1) <= 0.8 * pred[12]:
                        cds = 'fragment:{0:.2f}%'.format(
                            (pred[10] - pred[9] + 1) * 100 / pred[12])
                        allele_id = 'uncertain'
                        start, stop = s, e
                    else:
                        # Extend the region by whole codons: up to 60 bases
                        # upstream and 600 downstream of the gene (swapped in
                        # contig coordinates for the '-' strand).
                        if pred[11] == '+':
                            s2 = s - min(3 * int((s - 1) / 3), 60)
                            e2 = e + min(3 * int((pred[13] - e) / 3), 600)
                            seq = genomes[pred[5]][1][(s2 - 1):e2]
                            lp, rp = s - s2, e2 - e
                        else:
                            s2 = s - min(3 * int((s - 1) / 3), 600)
                            e2 = e + min(3 * int((pred[13] - e) / 3), 60)
                            seq = rc(genomes[pred[5]][1][(s2 - 1):e2])
                            rp, lp = s - s2, e2 - e

                        # seq2: the matched region itself, without the flanks
                        seq2 = seq[(lp):(len(seq) - rp)]
                        if seq2 not in alleles[pred[0]]:
                            if pred[3] == pred[0] and pred[7] == 1 and pred[
                                    8] == pred[12]:
                                alleles[pred[0]][seq2] = len(alleles[pred[0]]) + 1
                            else:
                                alleles[pred[0]][seq2] = 'LowQ{0}'.format(
                                    len(alleles[pred[0]]) + 1)
                            allele_id = str(alleles[pred[0]][seq2])
                            allele_file.write('>{0}_{1}\n{2}\n'.format(
                                pred[0], allele_id, seq2))
                        else:
                            allele_id = str(alleles[pred[0]][seq2])

                        frames = sorted(set([0, len(seq) % 3]))
                        for frame, aa_seq in zip(
                                frames,
                                transeq({'n': seq},
                                        transl_table='starts',
                                        frame=','.join(
                                            [str(f + 1) for f in frames]))['n']):
                            cds = 'CDS'
                            # prefer a start codon within 30 residues after
                            # the match start, else the closest one before it
                            s0, s1 = aa_seq.find('M', int(lp / 3),
                                                 int(lp / 3 + 30)), aa_seq.rfind(
                                                     'M', 0, int(lp / 3))
                            start = s0 if s0 >= 0 else s1
                            if start < 0:
                                cds, start = 'nostart', int(lp / 3)
                            stop = aa_seq.find('X', start)
                            if 0 <= stop < lp / 3 + 30:
                                # stop right after the start: retry with a
                                # start codon downstream of that stop
                                s0 = aa_seq.find('M', stop, int(lp / 3 + 30))
                                if s0 >= 0:
                                    start = s0
                                    stop = aa_seq.find('X', start)
                            if stop < 0:
                                cds = 'nostop'
                            elif (stop - start + 1) * 3 <= 0.8 * pred[12]:
                                cds = 'premature stop:{0:.2f}%'.format(
                                    (stop - start + 1) * 300 / pred[12])

                            if cds == 'CDS':
                                # convert residue indices to contig coordinates
                                if pred[11] == '+':
                                    start, stop = s2 + start * 3 + frame, s2 + stop * 3 + 2 + frame
                                else:
                                    start, stop = e2 - stop * 3 - 2 - frame, e2 - start * 3 - frame
                                break
                            else:
                                start, stop = s, e
                                if frame > 0:
                                    cds = 'frameshift'

                    if pred[5] != op[0]:
                        op = [pred[5], 0, old_prediction.get(pred[5], [])]
                    old_tag = []
                    for k in xrange(op[1], len(op[2])):
                        opd = op[2][k]
                        if opd[2] < start:
                            op[1] = k + 1
                        elif opd[1] > stop:
                            break
                        elif opd[3] != pred[11]:
                            continue
                        ovl = min(opd[2], stop) - max(opd[1], start) + 1
                        if ovl >= 300 or ovl >= 0.6 * (
                                opd[2] - opd[1] + 1) or ovl >= 0.6 * (stop -
                                                                      start + 1):
                            # keep only old annotations in the same frame
                            frame = min((opd[1] - start) % 3, (opd[2] - stop) % 3)
                            if frame == 0:
                                old_tag.append('{0}:{1}-{2}'.format(*opd))

                    fout.write(
                        '{0}\t{1}\tEToKi-ortho\t{2}\t{3}\t.\t{4}\t.\tID={5};{12}inference=ortholog group:{6},allele ID:{7},matched region:{8}-{9}{10}{11}\n'
                        .format(
                            pred[5],
                            'CDS' if cds == 'CDS' else 'pseudogene',
                            start,
                            stop,
                            pred[11],
                            '{0}_{1}_{2}'.format(prefix, gid, pid),
                            pred[0],
                            allele_id,
                            s,
                            e,
                            '' if pred[0] == pred[3] else
                            ',structure variant group:' + pred[3],
                            '' if cds == 'CDS' else ';pseudogene=' + cds,
                            '' if len(old_tag) == 0 else 'locus_tag={0};'.format(
                                ','.join(old_tag)),
                        ))
    return
예제 #4
0
def iter_map_bsn(data):
    """Map cluster references onto one genome and save the grouped hits.

    The genome is written to a temporary FASTA, searched with ``uberBlast``,
    and the accepted hits are stored in ``<prefix>.<id>.bsn.npz`` under the
    keys ``bsn`` (hit groups) and ``ovl`` (pairs of overlapping groups).

    Args:
        data: tuple ``(prefix, clust, id, taxon, seq, params)``.  ``seq`` is
            an iterable of ``(name, sequence)`` pairs; ``params`` supplies
            ``match_identity``, ``match_frag_len``, ``match_prop``,
            ``match_len``, ``match_prop2`` and ``match_len2``.

    Returns:
        The output prefix ``'<prefix>.<id>'``.

    Each group is a list
    ``[query, target, score, identity, aligned bytes, group index, hits]``;
    slot 2 is replaced by the in-frame match length computed below.
    NOTE(review): column meanings of the ``uberBlast`` table (12 = query
    length, 14 = CIGAR, 15 = hit id, 16 = summary list) are inferred from how
    they are used here -- confirm against ``uberBlast``.
    """
    prefix, clust, id, taxon, seq, params = data
    gfile, out_prefix = '{0}.{1}.genome'.format(prefix, id), '{0}.{1}'.format(
        prefix, id)
    with open(gfile, 'w') as fout:
        for n, s in seq:
            fout.write('>{0}\n{1}\n'.format(n, s))

    try:
        blastab, overlap = uberBlast(
            '-r {0} -q {1} -f -m -o --blastn --ublast --min_id {2} --min_cov {3} -t 2 -s 2 -e 0,3'
            .format(gfile, clust, params['match_identity'] - 0.1,
                    params['match_frag_len']).split())
    finally:
        # the temporary genome file is removed even if the search fails
        os.unlink(gfile)

    groups = []
    # multi-fragment hits, keyed by the id of their first fragment
    groups2 = {}
    n_ids = np.max(blastab.T[15]) + 1
    ids = np.zeros(n_ids, dtype=bool)
    for tab in blastab:
        if tab[16][1] >= params['match_identity'] and tab[16][2] >= max(
                params['match_prop'] * tab[12],
                params['match_len']) and tab[16][2] >= max(
                    params['match_prop2'] * tab[12], params['match_len2']):
            ids[tab[15]] = True
            if len(tab[16]) <= 4:
                # single-fragment hit
                groups.append(tab[:2].tolist() + tab[16][:2] +
                              [None, 0, [tab[:16]]])
            else:
                # fragment of a chained hit: also keep it on its own when it
                # passes the thresholds by itself
                length = tab[7] - tab[6] + 1
                if tab[2] >= params['match_identity'] and length >= max(
                        params['match_prop'] * tab[12],
                        params['match_len']) and length >= max(
                            params['match_prop2'] * tab[12],
                            params['match_len2']):
                    groups.append(tab[:2].tolist() +
                                  [tab[11], tab[2], None, 0, [tab[:16]]])
                if tab[16][3] not in groups2:
                    # the placeholder slots are replaced (never mutated), so
                    # sharing one [] object between them is harmless
                    groups2[tab[16][3]] = tab[:2].tolist() + tab[16][:2] + [
                        None, 0, [[]] * (len(tab[16]) - 3)
                    ]
                x = [i for i, t in enumerate(tab[16][3:]) if t == tab[15]][0]
                groups2[tab[16][3]][6][x] = tab[:16]
        else:
            # mark rejected hits
            tab[2] = -1
    groups.extend(list(groups2.values()))
    overlap = overlap[ids[overlap.T[0]] & ids[overlap.T[1]], :2]
    # hit id -> group index, for single-hit (convA) and chained (convB) groups
    convA, convB = np.tile(-1, n_ids), np.tile(-1, n_ids)
    seq = dict(seq)
    for gid, group in enumerate(groups):
        # aligned sequence as bytes, pre-filled with '-' (ASCII 45)
        group[4] = np.zeros(group[6][0][12], dtype=np.uint8)
        group[4].fill(45)
        group[5] = gid
        group[6] = np.array(group[6])
        if group[6].shape[0] == 1:
            convA[group[6].T[15].astype(int)] = gid
        else:
            convB[group[6].T[15].astype(int)] = gid
        max_sc = 0
        for tab in group[6]:
            matchedSeq = seq[tab[1]][tab[8] -
                                     1:tab[9]] if tab[8] < tab[9] else rc(
                                         seq[tab[1]][tab[9] - 1:tab[8]])
            # ms: aligned pieces, i: offset in matchedSeq, f: current frame,
            # sc: matched bases accumulated per frame
            ms, i, f, sc = [], 0, 0, [0, 0, 0]
            for s, t in re.findall(r'(\d+)([A-Z])', tab[14]):
                s = int(s)
                if t == 'M':
                    ms.append(matchedSeq[i:i + s])
                    i += s
                    sc[f] += s
                elif t == 'D':
                    i += s
                    f = (f - s) % 3
                else:
                    ms.append('-' * s)
                    f = (f + s) % 3
            group[4][tab[6] - 1:tab[7]] = np.array(list(
                ''.join(ms))).view(asc2int).astype(np.uint8)
            max_sc += max(sc[0], sc[f])
        group[2] = max_sc
    # translate hit-level overlaps into group-level overlaps, and add the
    # pairs of groups (single / chained) that share a hit
    overlap = np.vstack([np.vstack([m, n]).T[(m>=0) & (n >=0)] for m in (convA[overlap.T[0]], convB[overlap.T[0]]) \
                         for n in (convA[overlap.T[1]], convB[overlap.T[1]]) ] + [np.vstack([convA, convB]).T[(convA >= 0) & (convB >=0)]])

    np.savez_compressed(out_prefix + '.bsn.npz',
                        bsn=np.array(groups, dtype=object),
                        ovl=overlap)
    return out_prefix