Example #1
    def run(self):
        super(JobConsensus, self).run()
        if not os.path.isdir(self.consensus_dir):
            os.mkdir(self.consensus_dir)

        #split into 1Mb chunks to reduce RAM usage
        CHUNK_SIZE = 1000000
        chunks_file = os.path.join(self.consensus_dir, "chunks.fasta")
        chunks = aln.split_into_chunks(fp.read_sequence_dict(self.in_contigs),
                                       CHUNK_SIZE)
        fp.write_fasta_dict(chunks, chunks_file)

        logger.info("Running Minimap2")
        out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
        aln.make_alignment(chunks_file,
                           self.args.reads,
                           self.args.threads,
                           self.consensus_dir,
                           self.args.platform,
                           out_alignment,
                           reference_mode=True,
                           sam_output=True)

        contigs_info = aln.get_contigs_info(chunks_file)
        logger.info("Computing consensus")
        consensus_fasta = cons.get_consensus(out_alignment, chunks_file,
                                             contigs_info, self.args.threads,
                                             self.args.platform)

        #merge chunks back into single sequences
        merged_fasta = aln.merge_chunks(consensus_fasta)
        fp.write_fasta_dict(merged_fasta, self.out_consensus)
        os.remove(chunks_file)
        os.remove(out_alignment)
Example #2
    def run(self):
        super(JobTrestle, self).run()

        if not os.path.isdir(self.work_dir):
            os.mkdir(self.work_dir)

        summary_file = os.path.join(self.work_dir, "trestle_summary.txt")
        resolved_repeats_seqs = os.path.join(self.work_dir,
                                             "resolved_copies.fasta")
        repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
        repeat_graph.load_from_file(self.repeat_graph)

        try:
            repeats_info = tres_graph \
                .get_simple_repeats(repeat_graph, self.reads_alignment_file,
                                    fp.read_sequence_dict(self.graph_edges))
            tres_graph.dump_repeats(
                repeats_info, os.path.join(self.work_dir, "repeats_dump"))

            tres.resolve_repeats(self.args, self.work_dir, repeats_info,
                                 summary_file, resolved_repeats_seqs)
            tres_graph.apply_changes(
                repeat_graph, summary_file,
                fp.read_sequence_dict(resolved_repeats_seqs))
        except KeyboardInterrupt:
            raise
        #except Exception as e:
        #    logger.warning("Caught unhandled exception: " + str(e))
        #    logger.warning("Continuing to the next pipeline stage. "
        #                   "Please submit a bug report along with the full log file")

        repeat_graph.dump_to_file(self.out_files["repeat_graph"])
        fp.write_fasta_dict(repeat_graph.edges_fasta,
                            self.out_files["repeat_graph_edges"])
Example #3
def generate_scaffolds(contigs_file, links_file, out_scaffolds):

    contigs_fasta = fp.read_sequence_dict(contigs_file)
    used_contigs = set()

    connections = {}
    with open(links_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line: continue
            ctg_1, sign_1, ctg_2, sign_2 = line.split("\t")
            if ctg_1 in contigs_fasta and ctg_2 in contigs_fasta:
                connections[sign_1 + ctg_1] = sign_2 + ctg_2
                connections[rc(sign_2) + ctg_2] = rc(sign_1) + ctg_1

    scaffolds_fasta = {}
    scaffolds_seq = {}
    for ctg in contigs_fasta:
        if ctg in used_contigs: continue

        used_contigs.add(ctg)
        scf = ["-" + ctg]
        #extending right
        while (scf[-1] in connections
               and unsigned(connections[scf[-1]]) not in used_contigs):
            scf.append(connections[scf[-1]])
            used_contigs.add(unsigned(scf[-1]))

        for i, ctg in enumerate(scf):
            scf[i] = rc(ctg[0]) + unsigned(ctg)
        scf = scf[::-1]

        #extending left
        while (scf[-1] in connections
               and unsigned(connections[scf[-1]]) not in used_contigs):
            scf.append(connections[scf[-1]])
            used_contigs.add(unsigned(scf[-1]))

        #generating sequence interleaved by Ns
        if len(scf) == 1:
            scaffolds_fasta[unsigned(ctg)] = contigs_fasta[unsigned(ctg)]
            scaffolds_seq[unsigned(ctg)] = scf
        else:
            scf_name = "scaffold_" + unsigned(scf[0]).strip("contig_")
            scaffolds_seq[scf_name] = scf
            scf_seq = []
            for scf_ctg in scf:
                if scf_ctg[0] == "+":
                    scf_seq.append(contigs_fasta[unsigned(scf_ctg)])
                else:
                    scf_seq.append(
                        fp.reverse_complement(
                            contigs_fasta[unsigned(scf_ctg)]))
            gap = "N" * cfg.vals["scaffold_gap"]
            scaffolds_fasta[scf_name] = gap.join(scf_seq)

    fp.write_fasta_dict(scaffolds_fasta, out_scaffolds)
    return scaffolds_seq
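
Note: generate_scaffolds relies on two helpers, rc and unsigned, that are not shown here. A minimal sketch consistent with their usage above (the orientation sign is the first character of a signed contig name); the actual Flye implementations may differ:

def rc(sign):
    #flip a single orientation sign: "+" <-> "-"
    return "-" if sign == "+" else "+"

def unsigned(ctg_id):
    #drop the leading "+"/"-" orientation sign, if present
    return ctg_id[1:] if ctg_id[0] in "+-" else ctg_id

For example, rc("+") yields "-", and unsigned("+contig_7") yields "contig_7".
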
Example #4
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    High-level polisher interface
    """

    logger_func = logger.info if output_progress else logger.debug

    subs_matrix = os.path.join(
        cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["subs_matrix"])
    hopo_matrix = os.path.join(
        cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["hopo_matrix"])

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger_func("Polishing genome ({0}/{1})".format(i + 1, num_iters))

        alignment_file = os.path.join(work_dir,
                                      "minimap_{0}.sam".format(i + 1))
        logger_func("Running minimap2")
        make_alignment(prev_assembly, read_seqs, num_threads, work_dir,
                       error_mode, alignment_file)

        logger_func("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         error_mode, num_threads,
                         bubbles_file)
        logger_func("Alignment error rate: {0}".format(mean_aln_error))

        logger_func("Correcting bubbles")
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(i + 1))
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out,
                        num_threads)
        polished_fasta, polished_lengths = _compose_sequence([consensus_out])
        fp.write_fasta_dict(polished_fasta, polished_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    stats_file = os.path.join(work_dir, "contigs_stats.txt")
    with open(stats_file, "w") as f:
        f.write("seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))
Example #5
def filter_by_coverage(args, stats_in, contigs_in, stats_out, contigs_out):
    """
    Filters out contigs with low coverage
    """
    SUBASM_MIN_COVERAGE = 1
    HARD_MIN_COVERAGE = cfg.vals["hard_minimum_coverage"]
    RELATIVE_MIN_COVERAGE = cfg.vals["relative_minimum_coverage"]

    ctg_stats = {}
    sum_cov = 0
    sum_length = 0

    with open(stats_in, "r") as f:
        for line in f:
            if line.startswith("#"): continue
            tokens = line.split("\t")
            ctg_id, ctg_len, ctg_cov = (tokens[0], int(tokens[1]),
                                        int(tokens[2]))
            ctg_stats[ctg_id] = (ctg_len, ctg_cov)
            sum_cov += ctg_cov * ctg_len
            sum_length += ctg_len

    mean_coverage = int(float(sum_cov) / sum_length)
    coverage_threshold = None
    if args.read_type == "subasm":
        coverage_threshold = SUBASM_MIN_COVERAGE
    elif args.meta:
        coverage_threshold = HARD_MIN_COVERAGE
    else:
        coverage_threshold = int(
            round(float(mean_coverage) / RELATIVE_MIN_COVERAGE))
        coverage_threshold = max(HARD_MIN_COVERAGE, coverage_threshold)
    logger.debug("Mean contig coverage: {0}, selected threshold: {1}".format(
        mean_coverage, coverage_threshold))

    filtered_num = 0
    filtered_seq = 0
    good_fasta = {}
    for hdr, seq in fp.stream_sequence(contigs_in):
        if ctg_stats[hdr][1] >= coverage_threshold:
            good_fasta[hdr] = seq
        else:
            filtered_num += 1
            filtered_seq += ctg_stats[hdr][0]
    logger.debug("Filtered {0} contigs of total length {1}".format(
        filtered_num, filtered_seq))

    fp.write_fasta_dict(good_fasta, contigs_out)
    with open(stats_out, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in good_fasta:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, ctg_stats[ctg_id][0],
                                             ctg_stats[ctg_id][1]))
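
Note: in the default branch, the coverage threshold is the mean contig coverage divided by RELATIVE_MIN_COVERAGE, floored at HARD_MIN_COVERAGE. A worked example with hypothetical values (the real ones come from cfg.vals and may differ):

#hypothetical config values, for illustration only
HARD_MIN_COVERAGE = 2
RELATIVE_MIN_COVERAGE = 10
mean_coverage = 42

coverage_threshold = int(round(float(mean_coverage) / RELATIVE_MIN_COVERAGE))  #4
coverage_threshold = max(HARD_MIN_COVERAGE, coverage_threshold)                #still 4
#a contig with coverage 3 is filtered out; coverage 4 or higher is kept
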
Example #6
    def run(self):
        super(JobShortPlasmidsAssembly, self).run()
        logger.info("Recovering short unassembled sequences")
        if not os.path.isdir(self.work_dir):
            os.mkdir(self.work_dir)
        plasmids = plas.assemble_short_plasmids(self.args, self.work_dir,
                                                self.contigs_path)

        #updating repeat graph
        repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
        repeat_graph.load_from_file(self.repeat_graph)
        plas.update_graph(repeat_graph, plasmids)
        repeat_graph.dump_to_file(self.out_files["repeat_graph"])
        fp.write_fasta_dict(repeat_graph.edges_fasta,
                            self.out_files["repeat_graph_edges"])
Example #7
def split_into_chunks(fasta_in, chunk_size, fasta_out):
    out_dict = {}
    for header, seq in fp.stream_sequence(fasta_in):
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)

            out_dict[chunk_hdr] = seq[start : end]

    fp.write_fasta_dict(out_dict, fasta_out)
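
Note: split_into_chunks encodes the chunk index into each header ("<header>$chunk_<i>"); Examples #1 and #10 undo this with merge_chunks. A minimal sketch of the inverse operation, assuming the "$chunk_" naming convention above; the fold_function signature is inferred from the calls in Example #10 and may differ from the actual Flye code:

def merge_chunks(chunks_dict, fold_function="".join):
    #group values by the original header encoded before "$chunk_"
    by_header = {}
    for chunk_hdr, value in chunks_dict.items():
        header, chunk_id = chunk_hdr.rsplit("$chunk_", 1)
        by_header.setdefault(header, []).append((int(chunk_id), value))

    merged = {}
    for header, chunks in by_header.items():
        chunks.sort(key=lambda pair: pair[0])   #restore original chunk order
        merged[header] = fold_function([value for _, value in chunks])
    return merged

With the default fold, sequence chunks are concatenated back into full contigs; Example #10 passes sum for per-chunk lengths and an averaging lambda for coverage values.
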
Example #8
    def run(self):
        if not os.path.isdir(self.consensus_dir):
            os.mkdir(self.consensus_dir)

        logger.info("Running Minimap2")
        out_alignment = os.path.join(self.consensus_dir, "minimap.sam")
        aln.make_alignment(self.in_contigs, self.args.reads, self.args.threads,
                           self.consensus_dir, self.args.platform,
                           out_alignment)

        contigs_info = aln.get_contigs_info(self.in_contigs)
        logger.info("Computing consensus")
        consensus_fasta = cons.get_consensus(out_alignment, self.in_contigs,
                                             contigs_info, self.args.threads,
                                             self.args.platform)
        fp.write_fasta_dict(consensus_fasta, self.out_consensus)
Example #9
    def run(self):
        super(JobConsensus, self).run()
        if not os.path.isdir(self.consensus_dir):
            os.mkdir(self.consensus_dir)

        logger.info("Running Minimap2")
        out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
        aln.make_alignment(self.in_contigs,
                           self.args.reads,
                           self.args.threads,
                           self.consensus_dir,
                           self.args.platform,
                           out_alignment,
                           reference_mode=True,
                           sam_output=True)

        contigs_info = aln.get_contigs_info(self.in_contigs)
        logger.info("Computing consensus")
        consensus_fasta = cons.get_consensus(out_alignment, self.in_contigs,
                                             contigs_info, self.args.threads,
                                             self.args.platform)

        fp.write_fasta_dict(consensus_fasta, self.out_consensus)
        os.remove(out_alignment)
Example #10
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(
        cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["subs_matrix"])
    hopo_matrix = os.path.join(
        cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["hopo_matrix"])
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome ({0}/{1})".format(i + 1, num_iters))

        #split into 1Mb chunks to reduce RAM usage
        #slightly vary chunk size between iterations
        CHUNK_SIZE = 1000000 - (i % 2) * 100000
        chunks_file = os.path.join(work_dir, "chunks_{0}.fasta".format(i + 1))
        chunks = split_into_chunks(fp.read_sequence_dict(prev_assembly),
                                   CHUNK_SIZE)
        fp.write_fasta_dict(chunks, chunks_file)

        ####
        logger.info("Running minimap2")
        alignment_file = os.path.join(work_dir,
                                      "minimap_{0}.sam".format(i + 1))
        make_alignment(chunks_file,
                       read_seqs,
                       num_threads,
                       work_dir,
                       error_mode,
                       alignment_file,
                       reference_mode=True,
                       sam_output=True)

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(chunks_file)
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, chunks_file,
                         error_mode, num_threads,
                         bubbles_file)

        logger.info("Alignment error rate: {0}".format(mean_aln_error))
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            open(stats_file, "w").write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w")
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out,
                        num_threads, output_progress)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        merged_chunks = merge_chunks(polished_fasta)
        fp.write_fasta_dict(merged_chunks, polished_file)

        #Cleanup
        os.remove(chunks_file)
        os.remove(bubbles_file)
        os.remove(consensus_out)
        os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    #merge information from chunks
    contig_lengths = merge_chunks(contig_lengths, fold_function=sum)
    coverage_stats = merge_chunks(coverage_stats,
                                  fold_function=lambda l: sum(l) / len(l))

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
Example #11
def assemble_short_plasmids(args, work_dir, contigs_path):
    logger.debug("Extracting unmapped reads")
    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path, args.reads, args.threads,
                   work_dir, args.platform, reads2contigs_mapping,
                   reference_mode=True, sam_output=False)

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                    unmapped_reads_path,
                                    mapping_rate_threshold=0.5)

    logger.debug("Finding self-mappings for unmapped reads")
    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")
    make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads,
                   work_dir, args.platform, unmapped_reads_mapping,
                   reference_mode=False, sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted %d circular reads", len(circular_reads))

    logger.debug("Extracing circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted %d circular pairs", len(circular_pairs))

    #extracting only the necessary subset of reads (the entire file could be pretty big)
    interesting_reads = {}
    for read in circular_reads:
        interesting_reads[read] = None
    for pair in circular_pairs:
        interesting_reads[pair[0].query] = None
        interesting_reads[pair[0].target] = None
    for hdr, seq in fp.stream_sequence(unmapped_reads_path):
        if hdr in interesting_reads:
            interesting_reads[hdr] = seq

    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, interesting_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, interesting_reads)
    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")
    fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) +
                             list(trimmed_circular_pairs.items())),
                        trimmed_sequences_path)

    logger.debug("Clustering circular sequences")
    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")
    make_alignment(trimmed_sequences_path, [trimmed_sequences_path], args.threads,
                   work_dir, args.platform, trimmed_sequences_mapping,
                   reference_mode=False, sam_output=False)

    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)

    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    _, polished_stats = \
        pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1,
                   args.threads, args.platform, output_progress=False)

    #extract coverage
    plasmids_with_coverage = {}
    if os.path.isfile(polished_stats):
        with open(polished_stats, "r") as f:
            for line in f:
                if line.startswith("#"): continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added %d extra contigs", len(plasmids_with_coverage))

    # remove all unnecessary files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage
Example #12
def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
                            error_mode, num_threads):
    """
    Generate polished graph edges sequences by extracting them from
    polished contigs
    """
    logger.debug("Generating polished GFA")

    alignment_file = os.path.join(work_dir, "edges_aln.sam")
    polished_dict = fp.read_sequence_dict(polished_contigs)
    make_alignment(polished_contigs, [edges_file], num_threads, work_dir,
                   error_mode, alignment_file)
    aln_reader = SynchronizedSamReader(alignment_file, polished_dict,
                                       cfg.vals["max_read_coverage"])
    aln_reader.init_reading()
    aln_by_edge = defaultdict(list)

    #collect all alignments for each edge; the first one is treated as the main alignment
    while not aln_reader.is_eof():
        _, ctg_aln = aln_reader.get_chunk()
        for aln in ctg_aln:
            aln_by_edge[aln.qry_id].append(aln)

    MIN_CONTAINMENT = 0.9
    updated_seqs = 0
    edges_dict = fp.read_sequence_dict(edges_file)
    for edge in edges_dict:
        if edge in aln_by_edge:
            main_aln = aln_by_edge[edge][0]
            map_start = main_aln.qry_start
            map_end = main_aln.qry_end
            for aln in aln_by_edge[edge]:
                if aln.trg_id == main_aln.trg_id and aln.trg_sign == main_aln.trg_sign:
                    map_start = min(map_start, aln.qry_start)
                    map_end = max(map_end, aln.qry_end)

            new_seq = polished_dict[main_aln.trg_id][map_start:map_end]
            if main_aln.trg_sign < 0:
                new_seq = fp.reverse_complement(new_seq)

            if float(len(new_seq)) / main_aln.qry_len > MIN_CONTAINMENT:
                edges_dict[edge] = new_seq
                updated_seqs += 1

    #writes fasta file with polished edges
    edges_polished = os.path.join(work_dir, "polished_edges.fasta")
    fp.write_fasta_dict(edges_dict, edges_polished)

    #writes gfa file with polished edges
    gfa_polished_path = os.path.join(work_dir, "polished_edges.gfa")
    with open(gfa_polished_path, "w") as gfa_polished, \
            open(gfa_file, "r") as gfa_in:
        for line in gfa_in:
            if line.startswith("S"):
                seq_id = line.split()[1]
                coverage_tag = line.split()[3]
                gfa_polished.write("S\t{0}\t{1}\t{2}\n".format(
                    seq_id, edges_dict[seq_id], coverage_tag))
            else:
                gfa_polished.write(line)

    logger.debug("{0} sequences remained unpolished".format(
        len(edges_dict) - updated_seqs))
Example #13
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           read_platform, read_type, output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(
        cfg.vals["pkg_root"],
        cfg.vals["err_modes"][read_platform]["subs_matrix"])
    hopo_matrix = os.path.join(
        cfg.vals["pkg_root"],
        cfg.vals["err_modes"][read_platform]["hopo_matrix"])
    use_hopo = cfg.vals["err_modes"][read_platform]["hopo_enabled"]
    use_hopo = use_hopo and (read_type == "raw")
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    bam_input = read_seqs[0].endswith("bam")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

        ####
        if not bam_input:
            logger.info("Running minimap2")
            alignment_file = os.path.join(work_dir,
                                          "minimap_{0}.bam".format(i + 1))
            make_alignment(prev_assembly,
                           read_seqs,
                           num_threads,
                           work_dir,
                           read_platform,
                           alignment_file,
                           reference_mode=True,
                           sam_output=True)
        else:
            logger.info("Polishing with provided bam")
            alignment_file = read_seqs[0]

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         read_platform, num_threads,
                         bubbles_file)

        logger.info("Alignment error rate: %f", mean_aln_error)
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            open(stats_file, "w").write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w")
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out,
                        num_threads, output_progress, use_hopo)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        fp.write_fasta_dict(polished_fasta, polished_file)

        #Cleanup
        os.remove(bubbles_file)
        os.remove(consensus_out)
        if not bam_input:
            os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
Example #14
def assemble_short_plasmids(args, work_dir, contigs_path):
    logger.debug("Assembling short plasmids")

    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path,
                   args.reads,
                   args.threads,
                   work_dir,
                   args.platform,
                   reads2contigs_mapping,
                   reference_mode=True,
                   sam_output=False)

    logger.debug("Extracting unmapped reads")
    unmapped_reads, n_processed_reads = \
        unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                        mapping_rate_threshold=0.5)

    n_unmapped_reads = len(unmapped_reads)
    unmapped_reads_ratio = 100 * float(len(unmapped_reads)) / n_processed_reads
    unmapped_reads_ratio = round(unmapped_reads_ratio, 1)
    logger.debug("Extracted {} unmapped reads ({} %)".format(
        n_unmapped_reads, unmapped_reads_ratio))

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    fp.write_fasta_dict(unmapped_reads, unmapped_reads_path)

    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")

    logger.debug("Finding self-mappings for unmapped reads")
    make_alignment(unmapped_reads_path, [unmapped_reads_path],
                   args.threads,
                   work_dir,
                   args.platform,
                   unmapped_reads_mapping,
                   reference_mode=False,
                   sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted {} circular reads".format(len(circular_reads)))

    logger.debug("Extracing circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted {} circular pairs".format(len(circular_pairs)))

    logger.debug("Extracting unique plasmids from circular sequences")
    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, unmapped_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, unmapped_reads)
    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")

    fp.write_fasta_dict(
        dict(list(trimmed_circular_reads.items()) +
             list(trimmed_circular_pairs.items())),
        trimmed_sequences_path)

    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")

    make_alignment(trimmed_sequences_path, [trimmed_sequences_path],
                   args.threads,
                   work_dir,
                   args.platform,
                   trimmed_sequences_mapping,
                   reference_mode=False,
                   sam_output=False)

    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)

    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    pol.polish(plasmids_raw, [unmapped_reads_path],
               work_dir,
               1,
               args.threads,
               args.platform,
               output_progress=False)

    #extract coverage
    plasmids_with_coverage = {}
    if os.path.isfile(os.path.join(work_dir, "contigs_stats.txt")):
        with open(os.path.join(work_dir, "contigs_stats.txt"), "r") as f:
            for line in f:
                if line.startswith("seq"): continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added {} extra contigs".format(len(plasmids_with_coverage)))

    # remove all unnecessary files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage
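
Note: every example on this page funnels its output through fp.write_fasta_dict, where fp is Flye's FASTA parser module (it also provides read_sequence_dict and stream_sequence used above). A minimal re-implementation of the writer for reference, assuming uncompressed output and standard line wrapping; the actual implementation may differ in details:

def write_fasta_dict(fasta_dict, filename, line_width=60):
    #write {header: sequence} as FASTA, wrapping sequences at line_width
    with open(filename, "w") as f:
        for header, seq in fasta_dict.items():
            f.write(">{0}\n".format(header))
            for i in range(0, len(seq), line_width):
                f.write(seq[i:i + line_width] + "\n")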