Пример #1
0
def splitSegKmeans(aligner, seg, mult, all_reads_list):
    polisher = Polisher(aligner, aligner.dir_distributor)
    all_reads = ContigStorage()
    base = seg.asContig()
    tmp = []
    rtv = readsToVectors(aligner, all_reads_list, base)
    kmeans = KMeans(n_clusters=mult, precompute_distances=True)
    recs = list(rtv.values())
    result = kmeans.fit_predict(X=[rec.v for rec in recs])
    print result
    clusters = dict()
    for i, c in enumerate(result):
        if c not in clusters:
            clusters[c] = []
        clusters[c].append(recs[i].al)
    for c in clusters.values():
        print str(c), ":", len(c)
    split_contigs = []
    split_reads = []
    for c in clusters.values():
        split_contigs.append(
            Contig(
                polisher.polishSmallSegment(base.asSegment(),
                                            c).seg_from.Seq(),
                str(len(split_contigs))))
        split_reads.append([al.seg_from.contig for al in c])
    maxpi = 1
    for i in range(mult):
        for j in range(mult):
            if i == j:
                sys.stdout.write("1.0 ")
                continue
            al = aligner.overlapAlign([split_contigs[i]],
                                      ContigStorage([split_contigs[j]
                                                     ])).next()
            sys.stdout.write(str(al.percentIdentity()) + " ")
            maxpi = max(maxpi, al.percentIdentity())
        print ""
    print "Maxpi:", maxpi
    if maxpi < 0.985:
        return zip(split_contigs, split_reads)
    else:
        return None
Пример #2
0
def readsToVectors(aligner, reads_list, base):
    als = []
    rtv = dict()
    polisher = Polisher(aligner, aligner.dir_distributor)
    for al in fixAlDir(aligner.overlapAlign(reads_list, ContigStorage([base])),
                       base):
        if len(al.seg_to) < len(base) - 100:
            continue
        else:
            als.append(al)
            rtv[al.seg_from.contig.id] = ReadRecord(al).extend(toVector(al))
    reads_list = [al.seg_from.contig for al in als]
    bases = [base]
    for base_al1, base_al2, base_al3 in zip(als[0::3], als[1::3], als[2::3]):
        base_candidate = Contig(
            polisher.polishSmallSegment(
                base.asSegment(),
                [base_al1, base_al2, base_al3]).seg_from.Seq(),
            str(len(bases)))
        rtr_als = []
        read_ids = set()
        #        base_candidate = base_al.seg_from.asContig()
        for al in fixAlDir(
                aligner.overlapAlign(reads_list,
                                     ContigStorage([base_candidate])),
                base_candidate):
            if len(al.seg_to) < len(base_candidate) - 100:
                continue
            else:
                rtr_als.append(al)
                read_ids.add(al.seg_from.contig.id)
        if len(read_ids) == len(als):
            bases.append(base_candidate)
            for al in rtr_als:
                rtv[al.seg_from.contig.id].extend(toVector(al))
            if len(bases) > 10:
                break
    for rec in rtv.values():
        print rec.read.id, len(rec.v), rec.v
    return rtv
Пример #3
0
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
Пример #4
0
def splitSeg(aligner, seg, mult, all_reads_list):
    all_reads = ContigStorage()
    base = seg.asContig()
    tmp = []
    for al in fixAlDir(
            aligner.overlapAlign(all_reads_list, ContigStorage([base])), base):
        if len(al.seg_to) < len(base) - 100:
            continue
        all_reads.add(al.seg_from.contig)
        tmp.append(al.seg_from.contig)
    all_reads_list = tmp
    split_reads = []
    split_contigs = []
    for i in range(mult):
        split_reads.append([])
        split_contigs.append(base)
    cnt = 0
    for read in all_reads_list:
        split_reads[cnt % mult].append(read)
    polisher = Polisher(aligner, aligner.dir_distributor)
    for i in range(10):
        print "Iteration", i
        split_contigs = []
        for reads in split_reads:
            tmp_als = fixAlDir(
                aligner.overlapAlign(reads, ContigStorage([base])), base)
            split_contigs.append(
                Contig(
                    polisher.polishSmallSegment(base.asSegment(),
                                                tmp_als).seg_from.Seq(),
                    str(len(split_contigs))))
        bestals = dict()
        for read in all_reads_list:
            bestals[read.id] = None
        for contig in split_contigs:
            for al in fixAlDir(
                    aligner.overlapAlign(all_reads_list,
                                         ContigStorage([contig])), contig):
                if len(al.seg_to) < len(base) - 100:
                    continue
                if al.seg_from.contig.id not in bestals:
                    print bestals.keys()
                    print al
                if bestals[al.seg_from.contig.
                           id] is None or al.percentIdentity() > bestals[
                               al.seg_from.contig.id].percentIdentity():
                    bestals[al.seg_from.contig.id] = al


#            als.append(fixAlDir(aligner.overlapAlign(all_reads_list, ContigStorage([contig])), contig))
#            als[-1] = sorted(als[-1], key = lambda al: al.seg_from.contig.id)
        for i in range(mult):
            split_reads[i] = []
        for rid in bestals:
            al = bestals[rid]
            if al is None:
                print "Warning: no alignment for read", rid
            else:
                split_reads[int(al.seg_to.contig.id)].append(
                    al.seg_from.contig)
        print " ".join(map(str, map(len, split_reads)))
    maxpi = 0
    print "pi matrix:"
    for i in range(mult):
        for j in range(mult):
            al = aligner.overlapAlign([split_contigs[i]],
                                      ContigStorage([split_contigs[j]
                                                     ])).next()
            sys.stdout.write(str(al.percentIdentity()) + " ")
            maxpi = max(maxpi, al.percentIdentity())
        print ""
    print "Maxpi:", maxpi
    if maxpi < 0.985:
        return zip(split_contigs, split_reads)
    else:
        return None