Exemplo n.º 1
0
 def test3(self):
     """Check the auto-alignments of line "c" after a local sequence correction.

     A 12 bp line is registered with two identical self-overlaps
     (0..8 -> 4..12 and 0..4 -> 8..12). Positions 3..6 are then corrected
     using the temporary contig "TCC", and the string form of the dot plot's
     auto-alignments for "c" is compared with the expected value.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     contig_line = storage.addNew("ACGTACGTACGT", "c")
     dot_plot = LineDotPlot(storage, self.aligner)
     # Build both self-overlaps first, then register them in the same order.
     overlaps = [
         AlignmentPiece.Identical(contig_line.segment(*src),
                                  contig_line.segment(*dst))
         for src, dst in (((0, 8), (4, 12)), ((0, 4), (8, 12)))
     ]
     for overlap in overlaps:
         dot_plot.addAlignment(overlap)
     patch = Contig("TCC", "tmp").asSegment()
     target = contig_line.segment(3, 6)
     contig_line.correctSequence([AlignmentPiece.Identical(patch, target)])
     expected = "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
     observed = str(list(dot_plot.auto_alignments["c"]))
     assert observed == expected
Exemplo n.º 2
0
 def test1(self):
     """Check the c1 -> c2 alignment stored in the dot plot after correcting c2.

     Two 16 bp lines are aligned end to end, the alignment is polished by the
     scorer and added to the dot plot. The first three positions of c2 are
     then corrected using the temporary contig "AGG", and the string form of
     the alignments to c2 from c1 is compared with the expected value.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     first = storage.addNew("ACGTAAAAGGGTACGT", "c1")
     second = storage.addNew("ACGTAAGGGGGTACGT", "c2")
     whole_to_whole = AlignmentPiece.Identical(first.asSegment(),
                                               second.asSegment())
     polished = self.scorer.polyshAlignment(
         whole_to_whole, params.alignment_correction_radius)
     dot_plot = LineDotPlot(storage, self.aligner)
     dot_plot.addAlignment(polished)
     patch = Contig("AGG", "tmp").asSegment()
     correction = AlignmentPiece.Identical(patch, second.segment(0, 3))
     second.correctSequence([correction])
     found = list(dot_plot.alignmentsToFrom[second.id][first.id])
     assert str(found) == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
Exemplo n.º 3
0
 def genAll(self, aligner):
     # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
     """Build line storage, dot plot and read collection from this object's data.

     Copies self.disjointigs into a fresh DisjointigCollection, creates one
     line per entry of self.contigs (each with an identity alignment added to
     its ``initial`` set), constructs the dot plot, aligns disjointigs to the
     lines, loads self.reads and attaches their local alignments to the
     disjointigs.

     Returns the tuple (lines, dot plot, reads).
     """
     disjointig_storage = DisjointigCollection()
     for source in self.disjointigs:
         disjointig_storage.addNew(source.seq, source.id)
     from disjointig_resolve.line_storage import NewLineStorage
     line_storage = NewLineStorage(disjointig_storage, aligner)

     def printable_name(line):
         # Line id followed by whatever translateBack reports for the line.
         return line.id + "_" + self.translateBack(line, aligner)

     line_storage.name_printer = printable_name
     for contig in self.contigs:
         created = line_storage.addNew(contig.seq, contig.id)
         # Identity alignment from a contig copy of the line onto the line.
         copy_segment = created.asSegment().asContig().asSegment()
         own_segment = created.asSegment()
         created.initial.add(
             AlignmentPiece.Identical(copy_segment, own_segment))
     dot_plot = LineDotPlot(line_storage, aligner)
     dot_plot.construct(aligner)
     line_storage.alignDisjointigs()
     read_storage = ReadCollection()
     for read in self.reads:
         read_storage.addNewRead(read)
     read_alignments = aligner.localAlign(read_storage, disjointig_storage)
     disjointig_storage.addAlignments(read_alignments)
     return line_storage, dot_plot, read_storage
Exemplo n.º 4
0
def loadAll(handler):
    # type: (TokenReader) -> Tuple[Params, Aligner, ContigCollection, ReadCollection, DisjointigCollection, NewLineStorage, LineDotPlot]
    """Restore a saved assembly state from a token stream.

    Objects are read from ``handler`` in a fixed order: command line
    parameters, aligner, contigs, disjointigs, lines and dot plot. Reads are
    not stored in the save; they are loaded again from
    ``cl_params.reads_file``.

    Returns the tuple
    (cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot).
    """
    cl_params = Params()
    cl_params.load(handler)
    aligner = Aligner.load(handler)
    sys.stdout.info("Loading contigs")
    contigs = ContigCollection()
    contigs.load(handler)
    sys.stdout.info("Loading reads")
    # NOTE(review): assemble() in this file calls CreateReadCollection with
    # three arguments (reads_file, cut_reads, downsample); confirm that the
    # two-argument form used here is intended.
    reads = CreateReadCollection(cl_params.reads_file, cl_params.downsample)
    # The reads file is closed as soon as loading returns instead of being
    # left to the garbage collector.
    # NOTE(review): downsample is taken from the module-level ``params``
    # rather than from the restored ``cl_params`` - confirm this is intended.
    with open(cl_params.reads_file, "r") as reads_handle:
        reads.loadFromFasta(reads_handle, downsample=params.downsample)
    # Disjointigs and lines are loaded against the reads plus the result of
    # addAllRC() on a copy, so the returned ``reads`` stays unmodified.
    tmp_reads = reads.copy().addAllRC()
    sys.stdout.info("Loading disjointigs")
    disjointigs = DisjointigCollection()
    disjointigs.load(handler, tmp_reads)
    sys.stdout.info("Loading lines")
    lines = NewLineStorage(disjointigs, aligner)
    lines.load(handler, tmp_reads, contigs)
    sys.stdout.info("Loading dot plot")
    dot_plot = LineDotPlot(lines, aligner)
    dot_plot.load(handler)
    sys.stdout.info("Loading finished")
    return cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot
Exemplo n.º 5
0
def assemble(args, bin_path):
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        cl_params.reads_file = os.path.dirname(__file__)  + "/../../test_dataset/reads.fasta"
        cl_params.genome_size = 30000
        cl_params.dir = os.path.dirname(__file__)  + "/../../test_results"
        ref.loadFromFile(os.path.dirname(__file__)  + "/../../test_dataset/axbctbdy.fasta", False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        print subprocess.check_output(["git", "diff"])
    sys.stdout.info("Preparing initial state")
    if cl_params.debug:
        save_handler = SaveHandler(os.path.join(cl_params.dir, "saves"))
    else:
        save_handler = None
    if cl_params.load_from is not None:
        # tmp = cl_params.focus
        sys.stdout.info("Loading initial state from saves")
        cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(TokenReader(open(cl_params.load_from, "r")))
        cl_params.parse(args)
        # cl_params.focus = tmp
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        dot_plot.printAll(sys.stdout)
        printState(lines)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)

        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample)


        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            subprocess.check_call([os.path.join(params.bin_path, "flye"), "--meta", "-o", assembly_dir, "-t", str(cl_params.threads), "--" + params.technology + "-raw", reads_file, "--genome-size", str(cl_params.genome_size), "--min-overlap", str(params.k)])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list) == 0:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir)
            # graph_file, contigs_file, disjointigs_file, rep_dir, graph_file_after, contigs_file_after = parseFlyeDir(assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8

        disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)

        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov, aligner, polisher, reads, cl_params.force_unique, all_unique)

        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs)

        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads, polisher)

        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)
        # dot_plot.printAll(sys.stdout)

        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines))
        for line in list(lines.unique()): # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)
        if cl_params.debug:
            sys.stdout.info( "Saving initial state")
            try:
                writer = save_handler.getWriter()
                sys.stdout.info("Save details:", writer.info)
                saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot)
            except Exception as e:
                _, _, tb = sys.exc_info()
                sys.stdout.warn("Could not write save")
                traceback.print_tb(tb)
                sys.stdout.INFO( "Message:", e.message)

    sys.stdout.trace( "Disjointig alignments")
    for line in lines:
        sys.stdout.trace( line.disjointig_alignments)
    sys.stdout.info("Starting expanding alignment-consensus loop")

    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler)

    dot_plot.printAll(sys.stdout)

    sys.stdout.trace( "Final result:")
    lines.printToFasta(open(os.path.join(cl_params.dir, "lines.fasta"), "w"))
    lines.printKnottedToFasta(open(os.path.join(cl_params.dir, "assembly.fasta"), "w"))
    printState(lines)
    sys.stdout.info("Finished")
    secs = int(time.time() - start)
    days = secs / 60 / 60 / 24
    hours = secs / 60 / 60 % 24
    mins = secs / 60 % 60
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins))
    if cl_params.test:
        passed = False
        for al in aligner.dotplotAlign(lines, ref):
            if len(al) > len(al.seg_to.contig) - 3000:
                passed = True
                break
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")