示例#1
0
 def collectRecords(self, corrected):
     # type: (List[Segment]) -> List[LineExtender.Record]
     """Create a record for every resolved segment reachable from ``corrected``.

     Each corrected segment is expanded to the left by ``params.k`` and
     looked up in ``self.dot_plot``.  For every alignment found there, the
     correct segments of the aligned line that intersect the alignment are
     scanned for completely resolved segments.  Each resolved segment that
     has no record yet gets one from ``self.createRecord``, built from
     reads realigned to the line and filtered by overlap with a focus
     window derived from the segment.

     :param corrected: segments to collect records for.
     :return: list of the records created during this call, one per
         resolved segment.
     """
     sys.stdout.trace("Collecting records", corrected)
     good_reads = set()
     read_bounds = {}
     records = {}  # type: Dict[Segment, LineExtender.Record]
     for seg in corrected:
         sys.stdout.trace("Oppa initial:", seg)
         seg = seg.expandLeft(params.k)
         sys.stdout.trace("Alignments relevant for", seg,
                          list(self.dot_plot.allInter(seg)))
         for dot_al in self.dot_plot.allInter(seg):
             # Image of the expanded segment on the aligned line.
             seg1 = dot_al.matchingSequence().mapSegUp(
                 dot_al.seg_from.contig, seg)
             line = dot_al.seg_from.contig  # type: NewLine
             for seg_correct in line.correct_segments.allInter(
                     dot_al.seg_from):
                 for seg_resolved in line.completely_resolved.allInter(
                         seg_correct):
                     if seg_resolved in records:
                         # Already handled via another alignment.
                         continue
                     # Look for a resolved segment after this one
                     # (presumably the next one to the right - confirm
                     # against ``find``); the line end is used when the
                     # segment already reaches it or nothing is found.
                     following = None
                     if not seg_resolved.right == len(line):
                         following = line.completely_resolved.find(
                             line.asSegment().suffix(
                                 pos=seg_resolved.right), 1)
                     if following is None:
                         next_start = len(line)
                     else:
                         next_start = following.left
                     # Never let the boundary get closer than 200
                     # positions to the end of the line.
                     next_start = min(next_start, len(line) - 200)
                     focus_left = max(
                         seg_resolved.left,
                         min(seg_resolved.right - params.k, seg1.left))
                     focus_right = min(seg_correct.right,
                                       next_start + params.k)
                     focus = line.segment(focus_left, focus_right)
                     # Collect the reads that have a relevant alignment
                     # to the focus window and realign them to the line.
                     reads = ContigStorage()
                     for relevant_al in list(
                             line.getRelevantAlignmentsFor(focus)):
                         reads.add(relevant_al.seg_from.contig)
                     realigned = list(
                         self.aligner.localAlign(reads.unique(),
                                                 ContigStorage([line])))
                     sys.stdout.trace("Focus:", focus, seg_resolved)
                     sys.stdout.trace(realigned)
                     final_als = []
                     for candidate in realigned:
                         # Express every alignment relative to ``line``
                         # rather than its reverse complement.
                         if candidate.seg_to.contig == line.rc:
                             candidate = candidate.rc
                         if candidate.seg_to.interSize(
                                 focus) >= params.k - 100:
                             final_als.append(candidate)
                     sys.stdout.trace(final_als)
                     sys.stdout.trace("Finished realignment of reads")
                     records[seg_resolved] = self.createRecord(
                         seg_resolved, next_start, seg_correct, final_als,
                         good_reads, read_bounds)
     return list(records.values())
示例#2
0
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(
                    seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                scorer.polyshMatching(tmp_al.matchingSequence(),
                                      params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        scorer.polyshMatching(al.matchingSequence(),
                              params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
示例#3
0
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    """Try to lengthen every contig shorter than ``params.k + 500``.

    Short contigs are collected, reads are aligned to them (only the reads
    selected from ``read_dump`` when it is given, all reads otherwise) and
    both ends of each contig are extended with ``polisher.polishEnd``.
    A contig that ends up longer than ``params.k + 500`` replaces the
    original in ``contigs``; any other short contig is removed from it.

    :param contigs: storage that is updated in place.
    :param reads: reads used for the extension.
    :param aligner: aligner used to map reads to the short contigs.
    :param polisher: polisher providing ``polishEnd``.
    :param read_dump: read dump file, or None to realign all reads.
    """
    sys.stdout.info("Extending short lines")
    min_length = params.k + 500
    short_contigs = ContigStorage()
    als = {}  # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) >= min_length:
            continue
        short_contigs.add(contig)
        # Alignments are tracked for both orientations in parallel lists.
        als[contig.id] = []
        als[contig.rc.id] = []

    if read_dump is None:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            # Keep only alignments that start within 20 positions of both
            # ends of the contig.
            if al.seg_to.left > 20 or al.rc.seg_to.left > 20:
                continue
            direct = als[al.seg_to.contig.id]
            for idx, known in enumerate(direct):
                if known.seg_from.contig.id == al.seg_from.contig.id:
                    # One alignment per read: keep the more identical one.
                    if al.percentIdentity() > known.percentIdentity():
                        direct[idx] = al
                        als[al.seg_to.contig.rc.id][idx] = al.rc
                    break
            else:
                direct.append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            target = ContigStorage([contig])
            for al in aligner.overlapAlign(relevant_reads[contig.id], target):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)

    for contig in short_contigs.unique():
        left_gain = 0
        right_gain = 0
        tmp_contig = contig
        if len(als[contig.id]) > 0:
            # Extend the right end first, then flip the resulting
            # alignments and extend what was the left end.
            tmp_contig, new_als = polisher.polishEnd(
                als[contig.id], params.reliable_coverage,
                max_extension=params.l - len(contig))
            right_gain = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd(
                [piece.rc for piece in new_als], params.reliable_coverage,
                max_extension=params.l - len(contig))
            left_gain = len(tmp_contig) - len(contig) - right_gain
        # NOTE: the extended sequence is used untrimmed; an earlier
        # trimming step for the added ends is disabled.
        if len(tmp_contig) > min_length:
            sys.stdout.info("Prolonged contig", contig.id, "for", left_gain, "and", right_gain, "nucleotides from left and right")
            # The second polishEnd ran on reversed alignments, so the
            # result is flipped back before being stored under the old id.
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)
示例#4
0
    # Polish a consensus with reads and write the result to <dir>/res.fasta,
    # then align the reads back to the polished sequences (<dir>/res.sam).
    # NOTE(review): ``reads_file`` and ``consensus_file`` are defined above
    # this fragment (presumably from sys.argv[2] and sys.argv[3]) - confirm.
    dir = sys.argv[1]
    # Every argument from the fifth one on is treated as an optional flag.
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"),
                                          num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"),
                                        num_names=False)
    if "accurate" in extra_params:
        # "accurate" mode: align all reads to the reference and polish each
        # canonical reference contig over its whole length with the
        # alignments that target it.
        res = []
        # groupby only merges adjacent items, so sort by target contig id
        # with the same key first.
        als = sorted(aligner.overlapAlign(reads, ref),
                     key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als,
                                           key=lambda al: al.seg_to.contig.id):
            # Groups for non-canonical ids are skipped.
            if basic.isCanonocal(rid):
                contig = ref[rid]
                corrected_seq = polisher.polishSegment(
                    contig.asSegment(), list(rals)).seg_from.Seq()
                res.append(Contig(corrected_seq, rid))
    else:
        # Default mode: let the polisher handle all unique contigs at once.
        res = polisher.polishMany(reads, list(ref.unique()))
    res_file = os.path.join(dir, "res.fasta")
    rf = open(res_file, "w")
    for c in res:
        SeqIO.write(c, rf, "fasta")
    rf.close()
    # Align the original reads file against the polished result.
    aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap",
                        os.path.join(dir, "res.sam"))
示例#5
0
from common import basic, params
from common.basic import CreateLog

from alignment.align_tools import Aligner, DirDistributor
from common.line_align import Scorer
from common.sequences import ContigStorage

if __name__ == "__main__":
    basic.ensure_dir_existance(sys.argv[1])
    CreateLog(sys.argv[1])
    reads = ContigStorage().loadFromFile(sys.argv[2])
    contigs = ContigStorage().loadFromFile(sys.argv[3])
    scorer = Scorer()
    dd = DirDistributor(sys.argv[1])
    aligner = Aligner(dd)
    for read in reads.unique():
        print "Processing read", read
        als = [
            scorer.polyshAlignment(al, params.alignment_correction_radius)
            for al in aligner.localAlign([read], contigs)
        ]
        for al1 in als:
            for al2 in als:
                if al1.seg_to.contig == al2.seg_to.contig:
                    continue
                print al1, "vs", al2
                scorer.scoreInCorrectSegments(al1,
                                              al1.seg_to.contig.asSegment(),
                                              al2,
                                              al2.seg_to.contig.asSegment())
示例#6
0
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
示例#7
0
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als,
                 key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right
                 - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(
                                str(min(8,
                                        max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[
            al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()