def run(self):
    super(JobConsensus, self).run()
    if not os.path.isdir(self.consensus_dir):
        os.mkdir(self.consensus_dir)

    #split into 1Mb chunks to reduce RAM usage
    CHUNK_SIZE = 1000000
    chunks_file = os.path.join(self.consensus_dir, "chunks.fasta")
    chunks = aln.split_into_chunks(fp.read_sequence_dict(self.in_contigs),
                                   CHUNK_SIZE)
    fp.write_fasta_dict(chunks, chunks_file)

    logger.info("Running Minimap2")
    out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
    aln.make_alignment(chunks_file, self.args.reads, self.args.threads,
                       self.consensus_dir, self.args.platform, out_alignment,
                       reference_mode=True, sam_output=True)

    contigs_info = aln.get_contigs_info(chunks_file)

    logger.info("Computing consensus")
    consensus_fasta = cons.get_consensus(out_alignment, chunks_file,
                                         contigs_info, self.args.threads,
                                         self.args.platform)

    #merge chunks back into single sequences
    merged_fasta = aln.merge_chunks(consensus_fasta)
    fp.write_fasta_dict(merged_fasta, self.out_consensus)

    os.remove(chunks_file)
    os.remove(out_alignment)
def run(self):
    super(JobTrestle, self).run()
    if not os.path.isdir(self.work_dir):
        os.mkdir(self.work_dir)

    summary_file = os.path.join(self.work_dir, "trestle_summary.txt")
    resolved_repeats_seqs = os.path.join(self.work_dir, "resolved_copies.fasta")

    repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
    repeat_graph.load_from_file(self.repeat_graph)

    try:
        repeats_info = tres_graph \
            .get_simple_repeats(repeat_graph, self.reads_alignment_file,
                                fp.read_sequence_dict(self.graph_edges))
        tres_graph.dump_repeats(repeats_info,
                                os.path.join(self.work_dir, "repeats_dump"))

        tres.resolve_repeats(self.args, self.work_dir, repeats_info,
                             summary_file, resolved_repeats_seqs)
        tres_graph.apply_changes(repeat_graph, summary_file,
                                 fp.read_sequence_dict(resolved_repeats_seqs))
    except KeyboardInterrupt:
        raise
    #except Exception as e:
    #    logger.warning("Caught unhandled exception: " + str(e))
    #    logger.warning("Continuing to the next pipeline stage. "
    #                   "Please submit a bug report along with the full log file")

    repeat_graph.dump_to_file(self.out_files["repeat_graph"])
    fp.write_fasta_dict(repeat_graph.edges_fasta,
                        self.out_files["repeat_graph_edges"])
def generate_scaffolds(contigs_file, links_file, out_scaffolds):
    contigs_fasta = fp.read_sequence_dict(contigs_file)
    used_contigs = set()

    connections = {}
    with open(links_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            ctg_1, sign_1, ctg_2, sign_2 = line.split("\t")
            if ctg_1 in contigs_fasta and ctg_2 in contigs_fasta:
                connections[sign_1 + ctg_1] = sign_2 + ctg_2
                connections[rc(sign_2) + ctg_2] = rc(sign_1) + ctg_1

    scaffolds_fasta = {}
    scaffolds_seq = {}
    for ctg in contigs_fasta:
        if ctg in used_contigs:
            continue
        used_contigs.add(ctg)
        scf = ["-" + ctg]

        #extending right
        while (scf[-1] in connections and
               unsigned(connections[scf[-1]]) not in used_contigs):
            scf.append(connections[scf[-1]])
            used_contigs.add(unsigned(scf[-1]))

        for i, ctg in enumerate(scf):
            scf[i] = rc(ctg[0]) + unsigned(ctg)
        scf = scf[::-1]

        #extending left
        while (scf[-1] in connections and
               unsigned(connections[scf[-1]]) not in used_contigs):
            scf.append(connections[scf[-1]])
            used_contigs.add(unsigned(scf[-1]))

        #generating sequence interleaved by Ns
        if len(scf) == 1:
            scaffolds_fasta[unsigned(ctg)] = contigs_fasta[unsigned(ctg)]
            scaffolds_seq[unsigned(ctg)] = scf
        else:
            scf_name = "scaffold_" + unsigned(scf[0]).strip("contig_")
            scaffolds_seq[scf_name] = scf
            scf_seq = []
            for scf_ctg in scf:
                if scf_ctg[0] == "+":
                    scf_seq.append(contigs_fasta[unsigned(scf_ctg)])
                else:
                    scf_seq.append(fp.reverse_complement(
                                        contigs_fasta[unsigned(scf_ctg)]))

            gap = "N" * cfg.vals["scaffold_gap"]
            scaffolds_fasta[scf_name] = gap.join(scf_seq)

    fp.write_fasta_dict(scaffolds_fasta, out_scaffolds)
    return scaffolds_seq
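#generate_scaffolds() above relies on two helpers, rc() and unsigned(), that are not
#defined in this section. Judging from their usage ("+"/"-" prefixed contig names),
#rc() flips an orientation sign and unsigned() strips it. A minimal sketch of how
#they might look (hypothetical implementations, for illustration only):
def rc(sign):
    """Flip a contig orientation sign: '+' <-> '-' (assumed helper)."""
    return "-" if sign == "+" else "+"

def unsigned(signed_ctg):
    """Strip the leading '+'/'-' sign from a signed contig name (assumed helper)."""
    return signed_ctg[1:]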
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    High-level polisher interface
    """
    logger_func = logger.info if output_progress else logger.debug

    subs_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][error_mode]["subs_matrix"])
    hopo_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][error_mode]["hopo_matrix"])

    prev_assembly = contig_seqs
    contig_lengths = None
    for i in xrange(num_iters):
        logger_func("Polishing genome ({0}/{1})".format(i + 1, num_iters))

        alignment_file = os.path.join(work_dir, "minimap_{0}.sam".format(i + 1))
        logger_func("Running minimap2")
        make_alignment(prev_assembly, read_seqs, num_threads,
                       work_dir, error_mode, alignment_file)

        logger_func("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir, "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         error_mode, num_threads, bubbles_file)
        logger_func("Alignment error rate: {0}".format(mean_aln_error))

        logger_func("Correcting bubbles")
        consensus_out = os.path.join(work_dir, "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir, "polished_{0}.fasta".format(i + 1))
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                        consensus_out, num_threads)
        polished_fasta, polished_lengths = _compose_sequence([consensus_out])
        fp.write_fasta_dict(polished_fasta, polished_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    stats_file = os.path.join(work_dir, "contigs_stats.txt")
    with open(stats_file, "w") as f:
        f.write("seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))
def filter_by_coverage(args, stats_in, contigs_in, stats_out, contigs_out):
    """
    Filters out contigs with low coverage
    """
    SUBASM_MIN_COVERAGE = 1
    HARD_MIN_COVERAGE = cfg.vals["hard_minimum_coverage"]
    RELATIVE_MIN_COVERAGE = cfg.vals["relative_minimum_coverage"]

    ctg_stats = {}
    sum_cov = 0
    sum_length = 0
    with open(stats_in, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            tokens = line.split("\t")
            ctg_id, ctg_len, ctg_cov = tokens[0], int(tokens[1]), int(tokens[2])
            ctg_stats[ctg_id] = (ctg_len, ctg_cov)
            sum_cov += ctg_cov * ctg_len
            sum_length += ctg_len
    mean_coverage = int(float(sum_cov) / sum_length)

    coverage_threshold = None
    if args.read_type == "subasm":
        coverage_threshold = SUBASM_MIN_COVERAGE
    elif args.meta:
        coverage_threshold = HARD_MIN_COVERAGE
    else:
        coverage_threshold = int(round(float(mean_coverage) /
                                       RELATIVE_MIN_COVERAGE))
        coverage_threshold = max(HARD_MIN_COVERAGE, coverage_threshold)
    logger.debug("Mean contig coverage: {0}, selected threshold: {1}"
                 .format(mean_coverage, coverage_threshold))

    filtered_num = 0
    filtered_seq = 0
    good_fasta = {}
    for hdr, seq in fp.stream_sequence(contigs_in):
        if ctg_stats[hdr][1] >= coverage_threshold:
            good_fasta[hdr] = seq
        else:
            filtered_num += 1
            filtered_seq += ctg_stats[hdr][0]
    logger.debug("Filtered {0} contigs of total length {1}"
                 .format(filtered_num, filtered_seq))

    fp.write_fasta_dict(good_fasta, contigs_out)
    with open(stats_out, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in good_fasta:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, ctg_stats[ctg_id][0],
                                             ctg_stats[ctg_id][1]))
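#Worked example of the default threshold arithmetic above. The config values used here
#(hard_minimum_coverage = 3, relative_minimum_coverage = 5) are assumptions for
#illustration only, not necessarily the values shipped in cfg.vals.
def _example_coverage_threshold(mean_coverage, hard_min=3, relative_min=5):
    """Hypothetical mirror of the non-subasm, non-meta branch above."""
    return max(hard_min, int(round(float(mean_coverage) / relative_min)))

#_example_coverage_threshold(50) -> 10; _example_coverage_threshold(10) -> 3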
def run(self):
    super(JobShortPlasmidsAssembly, self).run()
    logger.info("Recovering short unassembled sequences")
    if not os.path.isdir(self.work_dir):
        os.mkdir(self.work_dir)
    plasmids = plas.assemble_short_plasmids(self.args, self.work_dir,
                                            self.contigs_path)

    #updating repeat graph
    repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
    repeat_graph.load_from_file(self.repeat_graph)
    plas.update_graph(repeat_graph, plasmids)
    repeat_graph.dump_to_file(self.out_files["repeat_graph"])
    fp.write_fasta_dict(repeat_graph.edges_fasta,
                        self.out_files["repeat_graph_edges"])
def split_into_chunks(fasta_in, chunk_size, fasta_out):
    out_dict = {}
    for header, seq in fp.stream_sequence(fasta_in):
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            #the last chunk absorbs the remainder, so no chunk is shorter than chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)
            out_dict[chunk_hdr] = seq[start : end]
    fp.write_fasta_dict(out_dict, fasta_out)
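#A merge_chunks() function is referenced elsewhere in this section (e.g. in polish()
#below and via aln.merge_chunks() in the consensus job) as the inverse of the chunking
#above, but its body is not shown here. A minimal sketch, assuming chunk names follow
#the "{orig_header}$chunk_{index}" convention and that per-chunk values are folded with
#a caller-supplied function (string concatenation by default):
def merge_chunks(chunks_dict, fold_function="".join):
    """Hypothetical inverse of split_into_chunks(): regroups values keyed by
    '<name>$chunk_<i>' under '<name>', folding per-chunk values in chunk order."""
    grouped = {}
    for hdr, value in chunks_dict.items():
        if "$chunk_" in hdr:
            name, chunk_id = hdr.rsplit("$chunk_", 1)
            grouped.setdefault(name, []).append((int(chunk_id), value))
        else:
            grouped.setdefault(hdr, []).append((0, value))

    merged = {}
    for name, chunks in grouped.items():
        chunks.sort(key=lambda c: c[0])     #restore original chunk order
        merged[name] = fold_function([v for _, v in chunks])
    return merged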
def run(self):
    if not os.path.isdir(self.consensus_dir):
        os.mkdir(self.consensus_dir)

    logger.info("Running Minimap2")
    out_alignment = os.path.join(self.consensus_dir, "minimap.sam")
    aln.make_alignment(self.in_contigs, self.args.reads, self.args.threads,
                       self.consensus_dir, self.args.platform, out_alignment)

    contigs_info = aln.get_contigs_info(self.in_contigs)
    logger.info("Computing consensus")
    consensus_fasta = cons.get_consensus(out_alignment, self.in_contigs,
                                         contigs_info, self.args.threads,
                                         self.args.platform)
    fp.write_fasta_dict(consensus_fasta, self.out_consensus)
def run(self):
    super(JobConsensus, self).run()
    if not os.path.isdir(self.consensus_dir):
        os.mkdir(self.consensus_dir)

    logger.info("Running Minimap2")
    out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
    aln.make_alignment(self.in_contigs, self.args.reads, self.args.threads,
                       self.consensus_dir, self.args.platform, out_alignment,
                       reference_mode=True, sam_output=True)

    contigs_info = aln.get_contigs_info(self.in_contigs)

    logger.info("Computing consensus")
    consensus_fasta = cons.get_consensus(out_alignment, self.in_contigs,
                                         contigs_info, self.args.threads,
                                         self.args.platform)
    fp.write_fasta_dict(consensus_fasta, self.out_consensus)
    os.remove(out_alignment)
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads, error_mode, output_progress): """ High-level polisher interface """ logger_state = logger.disabled if not output_progress: logger.disabled = True subs_matrix = os.path.join( cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["subs_matrix"]) hopo_matrix = os.path.join( cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["hopo_matrix"]) stats_file = os.path.join(work_dir, "contigs_stats.txt") prev_assembly = contig_seqs contig_lengths = None coverage_stats = None for i in xrange(num_iters): logger.info("Polishing genome ({0}/{1})".format(i + 1, num_iters)) #split into 1Mb chunks to reduce RAM usage #slightly vary chunk size between iterations CHUNK_SIZE = 1000000 - (i % 2) * 100000 chunks_file = os.path.join(work_dir, "chunks_{0}.fasta".format(i + 1)) chunks = split_into_chunks(fp.read_sequence_dict(prev_assembly), CHUNK_SIZE) fp.write_fasta_dict(chunks, chunks_file) #### logger.info("Running minimap2") alignment_file = os.path.join(work_dir, "minimap_{0}.sam".format(i + 1)) make_alignment(chunks_file, read_seqs, num_threads, work_dir, error_mode, alignment_file, reference_mode=True, sam_output=True) ##### logger.info("Separating alignment into bubbles") contigs_info = get_contigs_info(chunks_file) bubbles_file = os.path.join(work_dir, "bubbles_{0}.fasta".format(i + 1)) coverage_stats, mean_aln_error = \ make_bubbles(alignment_file, contigs_info, chunks_file, error_mode, num_threads, bubbles_file) logger.info("Alignment error rate: {0}".format(mean_aln_error)) consensus_out = os.path.join(work_dir, "consensus_{0}.fasta".format(i + 1)) polished_file = os.path.join(work_dir, "polished_{0}.fasta".format(i + 1)) if os.path.getsize(bubbles_file) == 0: logger.info("No reads were aligned during polishing") if not output_progress: logger.disabled = logger_state open(stats_file, "w").write("#seq_name\tlength\tcoverage\n") open(polished_file, "w") return polished_file, stats_file ##### logger.info("Correcting bubbles") _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out, num_threads, output_progress) polished_fasta, polished_lengths = _compose_sequence(consensus_out) merged_chunks = merge_chunks(polished_fasta) fp.write_fasta_dict(merged_chunks, polished_file) #Cleanup os.remove(chunks_file) os.remove(bubbles_file) os.remove(consensus_out) os.remove(alignment_file) contig_lengths = polished_lengths prev_assembly = polished_file #merge information from chunks contig_lengths = merge_chunks(contig_lengths, fold_function=sum) coverage_stats = merge_chunks(coverage_stats, fold_function=lambda l: sum(l) / len(l)) with open(stats_file, "w") as f: f.write("#seq_name\tlength\tcoverage\n") for ctg_id in contig_lengths: f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id], coverage_stats[ctg_id])) if not output_progress: logger.disabled = logger_state return prev_assembly, stats_file
def assemble_short_plasmids(args, work_dir, contigs_path): logger.debug("Extracting unmapped reads") reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf") make_alignment(contigs_path, args.reads, args.threads, work_dir, args.platform, reads2contigs_mapping, reference_mode=True, sam_output=False) unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta") unmapped.extract_unmapped_reads(args, reads2contigs_mapping, unmapped_reads_path, mapping_rate_threshold=0.5) logger.debug("Finding self-mappings for unmapped reads") unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf") make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads, work_dir, args.platform, unmapped_reads_mapping, reference_mode=False, sam_output=False) logger.debug("Extracting circular reads") circular_reads = circular.extract_circular_reads(unmapped_reads_mapping) logger.debug("Extracted %d circular reads", len(circular_reads)) logger.debug("Extracing circular pairs") circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping) logger.debug("Extracted %d circular pairs", len(circular_pairs)) #extracting only the necesssary subset of reads (the entire file could be pretty big) interesting_reads = {} for read in circular_reads: interesting_reads[read] = None for pair in circular_pairs: interesting_reads[pair[0].query] = None interesting_reads[pair[0].target] = None for hdr, seq in fp.stream_sequence(unmapped_reads_path): if hdr in interesting_reads: interesting_reads[hdr] = seq trimmed_circular_reads = \ circular.trim_circular_reads(circular_reads, interesting_reads) trimmed_circular_pairs = \ circular.trim_circular_pairs(circular_pairs, interesting_reads) trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta") fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) + list(trimmed_circular_pairs.items())), trimmed_sequences_path) logger.debug("Clustering circular sequences") trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf") make_alignment(trimmed_sequences_path, [trimmed_sequences_path], args.threads, work_dir, args.platform, trimmed_sequences_mapping, reference_mode=False, sam_output=False) plasmids = \ circular.extract_unique_plasmids(trimmed_sequences_mapping, trimmed_sequences_path) plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta") fp.write_fasta_dict(plasmids, plasmids_raw) _, polished_stats = \ pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1, args.threads, args.platform, output_progress=False) #extract coverage plasmids_with_coverage = {} if os.path.isfile(polished_stats): with open(polished_stats, "r") as f: for line in f: if line.startswith("#"): continue tokens = line.strip().split() seq_id, coverage = tokens[0], int(tokens[2]) if coverage > 0: plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage logger.info("Added %d extra contigs", len(plasmids_with_coverage)) # remove all unnecesarry files os.remove(reads2contigs_mapping) os.remove(unmapped_reads_path) os.remove(unmapped_reads_mapping) os.remove(trimmed_sequences_path) os.remove(trimmed_sequences_mapping) return plasmids_with_coverage
def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
                            error_mode, num_threads):
    """
    Generates polished graph edge sequences by extracting them from polished contigs
    """
    logger.debug("Generating polished GFA")

    alignment_file = os.path.join(work_dir, "edges_aln.sam")
    polished_dict = fp.read_sequence_dict(polished_contigs)
    make_alignment(polished_contigs, [edges_file], num_threads,
                   work_dir, error_mode, alignment_file)
    aln_reader = SynchronizedSamReader(alignment_file, polished_dict,
                                       cfg.vals["max_read_coverage"])
    aln_reader.init_reading()
    aln_by_edge = defaultdict(list)

    #getting one best alignment for each contig
    while not aln_reader.is_eof():
        _, ctg_aln = aln_reader.get_chunk()
        for aln in ctg_aln:
            aln_by_edge[aln.qry_id].append(aln)

    MIN_CONTAINMENT = 0.9
    updated_seqs = 0
    edges_dict = fp.read_sequence_dict(edges_file)
    for edge in edges_dict:
        if edge in aln_by_edge:
            main_aln = aln_by_edge[edge][0]
            map_start = main_aln.qry_start
            map_end = main_aln.qry_end
            for aln in aln_by_edge[edge]:
                if aln.trg_id == main_aln.trg_id and aln.trg_sign == main_aln.trg_sign:
                    map_start = min(map_start, aln.qry_start)
                    map_end = max(map_end, aln.qry_end)

            new_seq = polished_dict[main_aln.trg_id][map_start:map_end]
            if main_aln.trg_sign < 0:
                new_seq = fp.reverse_complement(new_seq)

            #only replace the edge if it is mostly contained in the polished sequence
            if float(len(new_seq)) / main_aln.qry_len > MIN_CONTAINMENT:
                edges_dict[edge] = new_seq
                updated_seqs += 1

    #writes fasta file with polished edges
    edges_polished = os.path.join(work_dir, "polished_edges.fasta")
    fp.write_fasta_dict(edges_dict, edges_polished)

    #writes gfa file with polished edges
    gfa_polished = open(os.path.join(work_dir, "polished_edges.gfa"), "w")
    for line in open(gfa_file, "r"):
        if line.startswith("S"):
            seq_id = line.split()[1]
            coverage_tag = line.split()[3]
            gfa_polished.write("S\t{0}\t{1}\t{2}\n".format(
                seq_id, edges_dict[seq_id], coverage_tag))
        else:
            gfa_polished.write(line)
    gfa_polished.close()

    logger.debug("{0} sequences remained unpolished"
                 .format(len(edges_dict) - updated_seqs))
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           read_platform, read_type, output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][read_platform]["subs_matrix"])
    hopo_matrix = os.path.join(cfg.vals["pkg_root"],
                               cfg.vals["err_modes"][read_platform]["hopo_matrix"])
    use_hopo = cfg.vals["err_modes"][read_platform]["hopo_enabled"]
    use_hopo = use_hopo and (read_type == "raw")
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    bam_input = read_seqs[0].endswith("bam")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

        ####
        if not bam_input:
            logger.info("Running minimap2")
            alignment_file = os.path.join(work_dir, "minimap_{0}.bam".format(i + 1))
            make_alignment(prev_assembly, read_seqs, num_threads,
                           work_dir, read_platform, alignment_file,
                           reference_mode=True, sam_output=True)
        else:
            logger.info("Polishing with provided bam")
            alignment_file = read_seqs[0]

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir, "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         read_platform, num_threads, bubbles_file)
        logger.info("Alignment error rate: %f", mean_aln_error)

        consensus_out = os.path.join(work_dir, "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir, "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            open(stats_file, "w").write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w")
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix,
                        consensus_out, num_threads, output_progress, use_hopo)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        fp.write_fasta_dict(polished_fasta, polished_file)

        #Cleanup
        os.remove(bubbles_file)
        os.remove(consensus_out)
        if not bam_input:
            os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
def assemble_short_plasmids(args, work_dir, contigs_path): logger.debug("Assembling short plasmids") reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf") make_alignment(contigs_path, args.reads, args.threads, work_dir, args.platform, reads2contigs_mapping, reference_mode=True, sam_output=False) logger.debug("Extracting unmapped reads") unmapped_reads, n_processed_reads = \ unmapped.extract_unmapped_reads(args, reads2contigs_mapping, mapping_rate_threshold=0.5) n_unmapped_reads = len(unmapped_reads) unmapped_reads_ratio = 100 * float(len(unmapped_reads)) / n_processed_reads unmapped_reads_ratio = round(unmapped_reads_ratio, 1) logger.debug("Extracted {} unmapped reads ({} %)".format( n_unmapped_reads, unmapped_reads_ratio)) unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta") fp.write_fasta_dict(unmapped_reads, unmapped_reads_path) unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf") logger.debug("Finding self-mappings for unmapped reads") make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads, work_dir, args.platform, unmapped_reads_mapping, reference_mode=False, sam_output=False) logger.debug("Extracting circular reads") circular_reads = circular.extract_circular_reads(unmapped_reads_mapping) logger.debug("Extracted {} circular reads".format(len(circular_reads))) logger.debug("Extracing circular pairs") circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping) logger.debug("Extracted {} circular pairs".format(len(circular_pairs))) logger.debug("Extracting unique plasmids from circular sequences") trimmed_circular_reads = \ circular.trim_circular_reads(circular_reads, unmapped_reads) trimmed_circular_pairs = \ circular.trim_circular_pairs(circular_pairs, unmapped_reads) trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta") fp.write_fasta_dict( dict(trimmed_circular_reads.items() + trimmed_circular_pairs.items()), trimmed_sequences_path) trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf") make_alignment(trimmed_sequences_path, [trimmed_sequences_path], args.threads, work_dir, args.platform, trimmed_sequences_mapping, reference_mode=False, sam_output=False) plasmids = \ circular.extract_unique_plasmids(trimmed_sequences_mapping, trimmed_sequences_path) plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta") fp.write_fasta_dict(plasmids, plasmids_raw) pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1, args.threads, args.platform, output_progress=False) #extract coverage plasmids_with_coverage = {} if os.path.isfile(os.path.join(work_dir, "contigs_stats.txt")): with open(os.path.join(work_dir, "contigs_stats.txt"), "r") as f: for line in f: if line.startswith("seq"): continue tokens = line.strip().split() seq_id, coverage = tokens[0], int(tokens[2]) if coverage > 0: plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage logger.info("Added {} extra contigs".format(len(plasmids_with_coverage))) # remove all unnecesarry files os.remove(reads2contigs_mapping) os.remove(unmapped_reads_path) os.remove(unmapped_reads_mapping) os.remove(trimmed_sequences_path) os.remove(trimmed_sequences_mapping) return plasmids_with_coverage