Exemplo n.º 1
0
 def test2(self):
     """Check segment storages after updating structures from all lines.

     Builds a dataset with one upper-cased disjointig and two contigs,
     generates reads, marks unique regions, and passes the
     ``completely_resolved`` segments of every line to
     ``LineExtender.updateAllStructures``.  The string forms of the
     ``correct_segments`` and ``completely_resolved`` storages of both
     contig lines are then compared with fixed expected values.
     """
     dataset = TestDataset("abcdefgcijklmCDEFGHInopqr")
     disjointig_name = dataset.addDisjointig(
         "abcdefgcijklmCDEFGHInopqrabcdefgcijklmCDEFGHInopqr".upper())
     first_name = dataset.addContig("abcde")
     second_name = dataset.addContig("klmCDE")
     dataset.generateReads(4, 20, True)
     lines, dot_plot, reads = dataset.genAll(self.aligner)
     marker = UniqueMarker(self.aligner)
     marker.markAllUnique(lines, reads)
     line1, line2 = lines[first_name], lines[second_name]
     extender = LineExtender(self.aligner, None, lines.disjointigs, dot_plot)
     resolved_segments = itertools.chain.from_iterable(
         line.completely_resolved for line in lines)
     extender.updateAllStructures(resolved_segments)
     print("%s %s" % (line1, line2))
     # Each entry pairs a storage with the exact text it must render as.
     checks = [
         (line1.correct_segments, "ReadStorage+:[C0_abcde[550:3850]]"),
         (line1.completely_resolved,
          "ReadStorage+:[C0_abcde[550:3000], C0_abcde[3300:3845]]"),
         (line2.correct_segments, "ReadStorage+:[C1_klmCDE[550:4395]]"),
         (line2.completely_resolved,
          "ReadStorage+:[C1_klmCDE[851:3549], C1_klmCDE[3851:4395]]"),
     ]
     # Dump everything first so a failing assert still leaves full output.
     for storage, _ in checks:
         print(str(storage))
     for storage, expected in checks:
         actual = str(storage)
         assert actual == expected, actual
Exemplo n.º 2
0
 def test1(self):
     """Check segment storages after updating from one line's correct segments.

     Builds a dataset with one upper-cased disjointig and two contigs,
     generates reads, marks unique regions, and passes a list copy of the
     first line's ``correct_segments`` to
     ``LineExtender.updateAllStructures``.  The string forms of both
     storages of both contig lines are then compared with fixed values.
     """
     dataset = TestDataset("abcdefghijklmCDEFGHInopqr")
     disjointig_name = dataset.addDisjointig(
         "abcdefghijklmCDEFGHInopqrabcdefghijklmCDEFGHInopqr".upper())
     first_name = dataset.addContig("abcde")
     second_name = dataset.addContig("klmCDE")
     dataset.generateReads(4, 20, True)
     lines, dot_plot, reads = dataset.genAll(self.aligner)
     marker = UniqueMarker(self.aligner)
     marker.markAllUnique(lines, reads)
     line1, line2 = lines[first_name], lines[second_name]
     extender = LineExtender(self.aligner, None, lines.disjointigs, dot_plot)
     seed_segments = list(line1.correct_segments)
     extender.updateAllStructures(seed_segments)
     # Each entry pairs a storage with the exact text it must render as.
     checks = [
         (line1.correct_segments, "ReadStorage+:[C0_abcde[0:2200]]"),
         (line1.completely_resolved, "ReadStorage+:[C0_abcde[0:2098]]"),
         (line2.correct_segments, "ReadStorage+:[C1_klmCDE[0:2749]]"),
         (line2.completely_resolved, "ReadStorage+:[C1_klmCDE[0:2649]]"),
     ]
     # All four storages go on a single space-separated output line.
     print(" ".join([str(storage) for storage, _ in checks]))
     for storage, expected in checks:
         actual = str(storage)
         assert actual == expected, actual
Exemplo n.º 3
0
 def testManual(self):
     """Check ``attemptCleanResolution`` on manually marked lines.

     No unique marker is run here: the whole of each contig line is added
     by hand to both its ``correct_segments`` and ``completely_resolved``
     storages.  The test then asks the extender for a clean resolution of
     the first line and compares the string form of ``res[0][1]`` and
     ``res[1][1]`` with fixed expected alignment lists.
     """
     dataset = TestDataset("abcdefghijklmCDEFGHInopqr")
     disjointig_name = dataset.addDisjointig(
         "abcdefghijklmCDEFGHInopqr".upper())
     first_name = dataset.addContig("abcde")
     second_name = dataset.addContig("klmCDE")
     dataset.generateReads(4, 5, True)
     lines, dot_plot, reads = dataset.genAll(self.aligner)
     marked_lines = []
     for contig_name in (first_name, second_name):
         line = lines[contig_name]
         # A fresh whole-line segment is created for each storage.
         for storage in (line.correct_segments, line.completely_resolved):
             storage.add(line.asSegment())
         marked_lines.append(line)
     extender = LineExtender(self.aligner, None, lines.disjointigs, dot_plot)
     res = extender.attemptCleanResolution(marked_lines[0].asSegment())
     expected_alignments = [
         "[(R2_bcde[0:2200-0]->C0_abcde[550:2750-0]:0.97), (R3_bcde[4:2192-0]->C0_abcde[553:2750-0]:0.97), (R4_cdef[0:1657]->C0_abcde[1100:2750-0]:0.96), (R5_cdef[0:1656]->C0_abcde[1100:2750-0]:0.96)]",
         "[(R24_mCDE[0:2201-0]->C1_klmCDE[1100:3298-0]:0.97), (R25_mCDE[0:2194-0]->C1_klmCDE[1100:3298-0]:0.97), (R27_CDEF[0:1658]->C1_klmCDE[1651:3298-0]:0.96)]",
     ]
     for index, expected in enumerate(expected_alignments):
         actual = str(res[index][1])
         assert actual == expected, actual
Exemplo n.º 4
0
 def testCase(self, instance):
     # type: (list[str]) -> None
     """Run the extension loop on one instance and count non-circular lines.

     ``instance`` is read as follows: ``instance[0]`` is the sequence used
     for the dataset and (followed by its upper-cased copy) for the
     disjointig, ``instance[1]`` is converted to int and passed as the
     first argument of ``generateReads``, ``instance[2]`` is the expected
     number of non-circular unique lines, and every remaining item is added
     as a contig.

     Lines are processed repeatedly until a full pass extends nothing; the
     test then asserts that the number of unique lines whose ``circular``
     flag is false equals the expected value.
     """
     dataset = TestDataset(instance[0], mutation_rate=0.01)
     disjointig_name = dataset.addDisjointig(
         instance[0] + instance[0].upper())
     dataset.generateReads(int(instance[1]), 25, True)
     expected_breaks = int(instance[2])
     for contig_seq in instance[3:]:
         dataset.addContig(contig_seq)
     lines, dot_plot, reads = dataset.genAll(self.aligner)
     UniqueMarker(self.aligner).markAllUnique(lines, reads)
     polisher = Polisher(self.aligner, self.aligner.dir_distributor)
     knotter = LineMerger(lines, polisher, dot_plot)
     extender = LineExtender(self.aligner, knotter, lines.disjointigs,
                             dot_plot)
     extender.updateAllStructures(
         itertools.chain.from_iterable(
             line.completely_resolved for line in lines))
     progress = True
     while progress:
         progress = False
         # Iterate over a snapshot of ids; ids that disappeared from
         # lines.items during this pass are skipped.
         for line_id in list(lines.items.keys()):
             if line_id in lines.items:
                 current = lines[line_id]
                 dot_plot.printAll(sys.stdout)
                 if extender.processLine(current):
                     progress = True
     translated = [
         str(dataset.translateBack(unique_line, self.aligner))
         for unique_line in lines.unique()
     ]
     print(" ".join(translated))
     circular_flags = [unique_line.circular for unique_line in lines.unique()]
     print(circular_flags)
     breaks = sum(1 for unique_line in lines.unique()
                  if not unique_line.circular)
     assert breaks == expected_breaks, "%s %s" % (breaks, expected_breaks)
Exemplo n.º 5
0
 def test1(self):
     """Run the extension loop on a fixed two-contig dataset and print results.

     Builds a dataset with mutation rate 0.01, one upper-cased disjointig
     and two contigs, marks unique regions, then repeatedly calls
     ``extender.processLine`` on every line until a full pass extends
     nothing.  The method makes no assertions; it prints the translated
     lines and their ``circular`` flags.
     """
     dataset = TestDataset("abcdefghijklmCDEFGHInopqr", mutation_rate=0.01)
     disjointig_name = dataset.addDisjointig(
         "abcdefghijklmCDEFGHInopqrabcd".upper())
     first_name = dataset.addContig("abcde")
     second_name = dataset.addContig("klmCDE")
     dataset.generateReads(5, 25, True)
     lines, dot_plot, reads = dataset.genAll(self.aligner)
     UniqueMarker(self.aligner).markAllUnique(lines, reads)
     line1, line2 = lines[first_name], lines[second_name]
     polisher = Polisher(self.aligner, self.aligner.dir_distributor)
     knotter = LineMerger(lines, polisher, dot_plot)
     extender = LineExtender(self.aligner, knotter, lines.disjointigs,
                             dot_plot)
     print("New iteration results")
     print("%s %s" % (dataset.translateBack(line1, self.aligner),
                      dataset.translateBack(line2, self.aligner)))
     extender.updateAllStructures(
         itertools.chain.from_iterable(
             line.completely_resolved for line in lines))
     progress = True
     while progress:
         progress = False
         # Iterate over a snapshot of ids; ids that disappeared from
         # lines.items during this pass are skipped.
         for line_id in list(lines.items.keys()):
             if line_id in lines.items:
                 current = lines[line_id]
                 dot_plot.printAll(sys.stdout)
                 if extender.processLine(current):
                     progress = True
     translated = [
         str(dataset.translateBack(unique_line, self.aligner))
         for unique_line in lines.unique()
     ]
     print(" ".join(translated))
     circular_flags = [unique_line.circular for unique_line in lines.unique()]
     print(circular_flags)
Exemplo n.º 6
0
def assemble(args, bin_path):
    """Run the whole assembly pipeline for one command line.

    The function parses ``args`` into run parameters, prepares the initial
    state (either loaded from a save file or built from reads, contigs and
    disjointigs), runs the expanding alignment-consensus loop (``EACL``) and
    writes ``lines.fasta`` and ``assembly.fasta`` into the output directory.
    In test mode it finally aligns the result against a bundled reference
    and logs whether the test passed.

    :param args: command-line arguments, passed to ``Params().parse``.
    :param bin_path: directory stored in ``params.bin_path``; the ``flye``
        executable is looked up there when no contigs file is given.
    :return: None.  Progress is reported through ``sys.stdout`` which is
        expected to provide ``info``/``warn``/``trace`` after ``CreateLog``.
    """
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        # os.path.join keeps these paths relative when dirname(__file__) is
        # empty; plain string concatenation would produce "/../../...".
        project_root = os.path.join(os.path.dirname(__file__), "..", "..")
        cl_params.reads_file = os.path.join(project_root, "test_dataset", "reads.fasta")
        cl_params.genome_size = 30000
        cl_params.dir = os.path.join(project_root, "test_results")
        ref.loadFromFile(os.path.join(project_root, "test_dataset", "axbctbdy.fasta"), False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        print(subprocess.check_output(["git", "diff"]))
    sys.stdout.info("Preparing initial state")
    # Saves are only written in debug mode.
    if cl_params.debug:
        save_handler = SaveHandler(os.path.join(cl_params.dir, "saves"))
    else:
        save_handler = None
    if cl_params.load_from is not None:
        sys.stdout.info("Loading initial state from saves")
        with open(cl_params.load_from, "r") as save_file:
            cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(TokenReader(save_file))
        # Re-parse so that the current command line is applied to the
        # parameters restored from the save.
        cl_params.parse(args)
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        dot_plot.printAll(sys.stdout)
        printState(lines)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)

        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample)

        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            # The file must be flushed and closed before Flye reads it.
            with open(reads_file, "w") as reads_out:
                reads.print_fasta(reads_out)
            subprocess.check_call([os.path.join(params.bin_path, "flye"), "--meta", "-o", assembly_dir, "-t", str(cl_params.threads), "--" + params.technology + "-raw", reads_file, "--genome-size", str(cl_params.genome_size), "--min-overlap", str(params.k)])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list) == 0:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            with open(reads_file, "w") as reads_out:
                reads.print_fasta(reads_out)
            disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8

        disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)

        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov, aligner, polisher, reads, cl_params.force_unique, all_unique)

        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs)

        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads, polisher)

        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)

        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines))
        # Iterate over a copy because lines may be removed inside the loop.
        for line in list(lines.unique()): # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)
        if cl_params.debug:
            sys.stdout.info("Saving initial state")
            # Saving is best effort: a failure is reported but does not
            # stop the assembly.
            try:
                writer = save_handler.getWriter()
                sys.stdout.info("Save details:", writer.info)
                saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot)
            except Exception as e:
                _, _, tb = sys.exc_info()
                sys.stdout.warn("Could not write save")
                traceback.print_tb(tb)
                # Use the same lower-case logger method as everywhere else;
                # str(e) also covers exceptions whose .message is empty.
                sys.stdout.info("Message:", str(e))

    sys.stdout.trace("Disjointig alignments")
    for line in lines:
        sys.stdout.trace(line.disjointig_alignments)
    sys.stdout.info("Starting expanding alignment-consensus loop")

    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler)

    dot_plot.printAll(sys.stdout)

    sys.stdout.trace("Final result:")
    with open(os.path.join(cl_params.dir, "lines.fasta"), "w") as lines_out:
        lines.printToFasta(lines_out)
    with open(os.path.join(cl_params.dir, "assembly.fasta"), "w") as assembly_out:
        lines.printKnottedToFasta(assembly_out)
    printState(lines)
    sys.stdout.info("Finished")
    # Explicit integer arithmetic: split elapsed seconds into d/h/m.
    total_minutes = int(time.time() - start) // 60
    total_hours, mins = divmod(total_minutes, 60)
    days, hours = divmod(total_hours, 24)
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins))
    if cl_params.test:
        # The test passes when some alignment of the result to the reference
        # is longer than the target contig length minus 3000.
        passed = False
        for al in aligner.dotplotAlign(lines, ref):
            if len(al) > len(al.seg_to.contig) - 3000:
                passed = True
                break
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")