Exemplo n.º 1
0
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    High-level polisher interface.

    Runs ``num_iters`` polishing rounds. Each round aligns the reads to the
    current assembly, splits the alignment into bubbles, corrects the
    bubbles and writes the result to ``polished_<round>.fasta`` inside
    ``work_dir``; that file is the input assembly of the next round.
    After the last round a tab-separated ``contigs_stats.txt`` (name,
    length, coverage) is written to ``work_dir`` using the lengths and
    coverage reported by the final round.

    Arguments:
        contig_seqs: assembly handed to the first round.
        read_seqs: reads, passed through to ``make_alignment``.
        work_dir: directory that receives all intermediate and output files.
        num_iters: number of polishing rounds. With zero rounds only the
            header line of the stats file is written.
        num_threads: thread count passed to the alignment, bubble and
            polishing steps.
        error_mode: key into ``cfg.vals["err_modes"]`` selecting the
            substitution / homopolymer matrices.
        output_progress: if true, progress is logged at INFO level,
            otherwise at DEBUG level.

    Returns nothing.
    """
    logger_func = logger.info if output_progress else logger.debug

    mode_cfg = cfg.vals["err_modes"][error_mode]
    subs_matrix = os.path.join(cfg.vals["pkg_root"], mode_cfg["subs_matrix"])
    hopo_matrix = os.path.join(cfg.vals["pkg_root"], mode_cfg["hopo_matrix"])

    prev_assembly = contig_seqs
    # Start from empty mappings so that num_iters == 0 yields a header-only
    # stats file instead of failing on an unset / None value below.
    contig_lengths = {}
    coverage_stats = {}
    # range (not xrange) keeps the loop working on both Python 2 and 3.
    for i in range(num_iters):
        round_no = i + 1
        logger_func("Polishing genome ({0}/{1})".format(round_no, num_iters))

        alignment_file = os.path.join(work_dir,
                                      "minimap_{0}.sam".format(round_no))
        logger_func("Running minimap2")
        make_alignment(prev_assembly, read_seqs, num_threads, work_dir,
                       error_mode, alignment_file)

        logger_func("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(round_no))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         error_mode, num_threads,
                         bubbles_file)
        logger_func("Alignment error rate: {0}".format(mean_aln_error))

        logger_func("Correcting bubbles")
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(round_no))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(round_no))
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out,
                        num_threads)
        polished_fasta, polished_lengths = _compose_sequence([consensus_out])
        fp.write_fasta_dict(polished_fasta, polished_file)

        # The polished output becomes the reference for the next round.
        contig_lengths = polished_lengths
        prev_assembly = polished_file

    stats_file = os.path.join(work_dir, "contigs_stats.txt")
    with open(stats_file, "w") as f:
        f.write("seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))
Exemplo n.º 2
0
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    High-level polisher interface.

    Runs ``num_iters`` polishing rounds. In every round the current
    assembly is split into chunks, reads are aligned to the chunks, the
    alignment is separated into bubbles, the bubbles are corrected and the
    polished chunks are merged back and written to
    ``polished_<round>.fasta`` in ``work_dir``. Intermediate files of a
    completed round are deleted. After the last round the per-chunk lengths
    and coverage are merged and written to ``contigs_stats.txt``.

    If a round produces an empty bubbles file (no reads aligned), an empty
    polished file and a header-only stats file are written and the function
    returns immediately.

    Arguments:
        contig_seqs: assembly handed to the first round.
        read_seqs: reads, passed through to ``make_alignment``.
        work_dir: directory that receives all intermediate and output files.
        num_iters: number of polishing rounds. With zero rounds the input
            assembly is returned and the stats file has only its header.
        num_threads: thread count passed to the alignment, bubble and
            polishing steps.
        error_mode: key into ``cfg.vals["err_modes"]`` selecting the
            substitution / homopolymer matrices.
        output_progress: if false, the module logger is disabled for the
            duration of the call and restored afterwards.

    Returns:
        Tuple ``(polished_assembly, stats_file)``.
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    # try/finally guarantees the logger state is restored on every exit
    # path, including exceptions raised by any polishing step.
    try:
        mode_cfg = cfg.vals["err_modes"][error_mode]
        subs_matrix = os.path.join(cfg.vals["pkg_root"],
                                   mode_cfg["subs_matrix"])
        hopo_matrix = os.path.join(cfg.vals["pkg_root"],
                                   mode_cfg["hopo_matrix"])
        stats_file = os.path.join(work_dir, "contigs_stats.txt")

        prev_assembly = contig_seqs
        contig_lengths = None
        coverage_stats = None
        # range (not xrange) keeps the loop working on both Python 2 and 3.
        for i in range(num_iters):
            round_no = i + 1
            logger.info("Polishing genome ({0}/{1})".format(round_no,
                                                            num_iters))

            #split into 1Mb chunks to reduce RAM usage
            #slightly vary chunk size between iterations
            #(1,000,000 when i is even, 900,000 when i is odd)
            chunk_size = 1000000 - (i % 2) * 100000
            chunks_file = os.path.join(work_dir,
                                       "chunks_{0}.fasta".format(round_no))
            chunks = split_into_chunks(fp.read_sequence_dict(prev_assembly),
                                       chunk_size)
            fp.write_fasta_dict(chunks, chunks_file)

            ####
            logger.info("Running minimap2")
            alignment_file = os.path.join(work_dir,
                                          "minimap_{0}.sam".format(round_no))
            make_alignment(chunks_file,
                           read_seqs,
                           num_threads,
                           work_dir,
                           error_mode,
                           alignment_file,
                           reference_mode=True,
                           sam_output=True)

            #####
            logger.info("Separating alignment into bubbles")
            contigs_info = get_contigs_info(chunks_file)
            bubbles_file = os.path.join(work_dir,
                                        "bubbles_{0}.fasta".format(round_no))
            coverage_stats, mean_aln_error = \
                make_bubbles(alignment_file, contigs_info, chunks_file,
                             error_mode, num_threads,
                             bubbles_file)

            logger.info("Alignment error rate: {0}".format(mean_aln_error))
            consensus_out = os.path.join(
                work_dir, "consensus_{0}.fasta".format(round_no))
            polished_file = os.path.join(
                work_dir, "polished_{0}.fasta".format(round_no))
            if os.path.getsize(bubbles_file) == 0:
                logger.info("No reads were aligned during polishing")
                # Context managers close the handles deterministically
                # instead of relying on garbage collection.
                with open(stats_file, "w") as f:
                    f.write("#seq_name\tlength\tcoverage\n")
                with open(polished_file, "w"):
                    pass    # only create / truncate the file
                return polished_file, stats_file

            #####
            logger.info("Correcting bubbles")
            _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                            consensus_out, num_threads, output_progress)
            polished_fasta, polished_lengths = \
                _compose_sequence(consensus_out)
            merged_chunks = merge_chunks(polished_fasta)
            fp.write_fasta_dict(merged_chunks, polished_file)

            #Cleanup
            for tmp_file in (chunks_file, bubbles_file,
                             consensus_out, alignment_file):
                os.remove(tmp_file)

            contig_lengths = polished_lengths
            prev_assembly = polished_file

        if contig_lengths is None:
            # No round was executed: there is nothing to merge or report.
            contig_lengths = {}
            coverage_stats = {}
        else:
            #merge information from chunks
            contig_lengths = merge_chunks(contig_lengths, fold_function=sum)
            coverage_stats = merge_chunks(
                coverage_stats, fold_function=lambda l: sum(l) / len(l))

        with open(stats_file, "w") as f:
            f.write("#seq_name\tlength\tcoverage\n")
            for ctg_id in contig_lengths:
                f.write("{0}\t{1}\t{2}\n".format(ctg_id,
                                                 contig_lengths[ctg_id],
                                                 coverage_stats[ctg_id]))

        return prev_assembly, stats_file
    finally:
        if not output_progress:
            logger.disabled = logger_state
Exemplo n.º 3
0
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           read_platform, read_type, output_progress):
    """
    High-level polisher interface.

    Runs ``num_iters`` polishing rounds. Each round obtains an alignment
    (either by running the aligner, or by reusing ``read_seqs[0]`` when it
    ends with "bam"), separates it into bubbles, corrects the bubbles and
    writes the result to ``polished_<round>.fasta`` in ``work_dir``; that
    file is the input assembly of the next round. Intermediate files of a
    completed round are deleted (a user-provided bam is never removed).
    After the last round ``contigs_stats.txt`` is written from the lengths
    and coverage reported by the final round.

    If a round produces an empty bubbles file (no reads aligned), an empty
    polished file and a header-only stats file are written and the function
    returns immediately.

    Arguments:
        contig_seqs: assembly handed to the first round.
        read_seqs: sequence of read file names; the first entry decides
            whether a ready-made bam alignment is used.
        work_dir: directory that receives all intermediate and output files.
        num_iters: number of polishing rounds. With zero rounds the input
            assembly is returned and the stats file has only its header.
        num_threads: thread count passed to the alignment, bubble and
            polishing steps.
        read_platform: key into ``cfg.vals["err_modes"]`` selecting the
            matrices and the ``hopo_enabled`` flag.
        read_type: homopolymer handling is requested only when this equals
            "raw" (and the platform enables it).
        output_progress: if false, the module logger is disabled for the
            duration of the call and restored afterwards.

    Returns:
        Tuple ``(polished_assembly, stats_file)``.
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    # try/finally guarantees the logger state is restored on every exit
    # path, including exceptions raised by any polishing step.
    try:
        platform_cfg = cfg.vals["err_modes"][read_platform]
        subs_matrix = os.path.join(cfg.vals["pkg_root"],
                                   platform_cfg["subs_matrix"])
        hopo_matrix = os.path.join(cfg.vals["pkg_root"],
                                   platform_cfg["hopo_matrix"])
        use_hopo = platform_cfg["hopo_enabled"]
        use_hopo = use_hopo and (read_type == "raw")
        stats_file = os.path.join(work_dir, "contigs_stats.txt")

        bam_input = read_seqs[0].endswith("bam")

        prev_assembly = contig_seqs
        # Empty mappings make num_iters == 0 produce a header-only stats
        # file instead of failing while iterating over None.
        contig_lengths = {}
        coverage_stats = {}
        for i in range(num_iters):
            round_no = i + 1
            logger.info("Polishing genome (%d/%d)", round_no, num_iters)

            ####
            if bam_input:
                logger.info("Polishing with provided bam")
                alignment_file = read_seqs[0]
            else:
                logger.info("Running minimap2")
                # NOTE(review): the file is named .bam while
                # sam_output=True is passed - confirm the format that
                # make_alignment actually writes.
                alignment_file = os.path.join(
                    work_dir, "minimap_{0}.bam".format(round_no))
                make_alignment(prev_assembly,
                               read_seqs,
                               num_threads,
                               work_dir,
                               read_platform,
                               alignment_file,
                               reference_mode=True,
                               sam_output=True)

            #####
            logger.info("Separating alignment into bubbles")
            contigs_info = get_contigs_info(prev_assembly)
            bubbles_file = os.path.join(work_dir,
                                        "bubbles_{0}.fasta".format(round_no))
            coverage_stats, mean_aln_error = \
                make_bubbles(alignment_file, contigs_info, prev_assembly,
                             read_platform, num_threads,
                             bubbles_file)

            logger.info("Alignment error rate: %f", mean_aln_error)
            consensus_out = os.path.join(
                work_dir, "consensus_{0}.fasta".format(round_no))
            polished_file = os.path.join(
                work_dir, "polished_{0}.fasta".format(round_no))
            if os.path.getsize(bubbles_file) == 0:
                logger.info("No reads were aligned during polishing")
                # Context managers close the handles deterministically
                # instead of relying on garbage collection.
                with open(stats_file, "w") as f:
                    f.write("#seq_name\tlength\tcoverage\n")
                with open(polished_file, "w"):
                    pass    # only create / truncate the file
                return polished_file, stats_file

            #####
            logger.info("Correcting bubbles")
            _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                            consensus_out, num_threads, output_progress,
                            use_hopo)
            polished_fasta, polished_lengths = \
                _compose_sequence(consensus_out)
            fp.write_fasta_dict(polished_fasta, polished_file)

            #Cleanup
            os.remove(bubbles_file)
            os.remove(consensus_out)
            if not bam_input:
                # Only alignments created by this function are deleted.
                os.remove(alignment_file)

            contig_lengths = polished_lengths
            prev_assembly = polished_file

        with open(stats_file, "w") as f:
            f.write("#seq_name\tlength\tcoverage\n")
            for ctg_id in contig_lengths:
                f.write("{0}\t{1}\t{2}\n".format(ctg_id,
                                                 contig_lengths[ctg_id],
                                                 coverage_stats[ctg_id]))

        return prev_assembly, stats_file
    finally:
        if not output_progress:
            logger.disabled = logger_state