def extract_unmapped_reads(args, reads2contigs_mapping, unmapped_reads_path,
                           mapping_rate_threshold):
    """
    Writes reads that are not sufficiently mapped to any contig into a
    FASTA file.

    args.reads is iterated as a list of read files; every sequence streamed
    from them is looked up by header in the table returned by
    calc_mapping_rates(reads2contigs_mapping). A read is considered mapped
    if at least one of its per-contig mapping rates is greater than or equal
    to mapping_rate_threshold. Reads missing from the table, or whose rates
    are all below the threshold, are written to unmapped_reads_path.

    Returns nothing; a summary of unmapped vs. total bases is logged at
    debug level.
    """
    mapping_rates = calc_mapping_rates(reads2contigs_mapping)

    total_bases = 0
    unmapped_bases = 0

    with open(unmapped_reads_path, "w") as fout:
        for reads_file in args.reads:
            for hdr, sequence in fp.stream_sequence(reads_file):
                total_bases += len(sequence)

                # Per-read table of contig -> mapping rate, or None if the
                # read has no entry at all.
                contigs = mapping_rates.get(hdr)

                # .values() is used instead of .iteritems() so that the
                # code runs on both Python 2 and Python 3.
                is_mapped = contigs is not None and any(
                    mapping_rate >= mapping_rate_threshold
                    for mapping_rate in contigs.values())

                if not is_mapped:
                    unmapped_bases += len(sequence)
                    fout.write(">{0}\n{1}\n".format(hdr, sequence))

    # Avoid ZeroDivisionError when the input files contain no sequence.
    if total_bases > 0:
        unmapped_fraction = float(unmapped_bases) / total_bases
    else:
        unmapped_fraction = 0.0

    logger.debug("Unmapped sequence: {0} / {1} ({2})".format(
        unmapped_bases, total_bases, unmapped_fraction))
def filter_by_coverage(args, stats_in, contigs_in, stats_out, contigs_out):
    """
    Filters out contigs with low coverage

    Reads a tab-separated stats file (stats_in) whose first three columns
    are contig id, length and coverage, computes the length-weighted mean
    coverage, and selects a threshold:

      * args.read_type == "subasm": a fixed threshold of 1;
      * args.meta is true: the configured "hard_minimum_coverage";
      * otherwise: mean coverage divided by the configured
        "relative_minimum_coverage", but not below the hard minimum.

    Contigs from contigs_in with coverage >= threshold are written to
    contigs_out, and their stats rows are written to stats_out.

    Raises KeyError if contigs_in contains a sequence that has no row in
    stats_in.
    """
    SUBASM_MIN_COVERAGE = 1
    HARD_MIN_COVERAGE = cfg.vals["hard_minimum_coverage"]
    RELATIVE_MIN_COVERAGE = cfg.vals["relative_minimum_coverage"]

    ctg_stats = {}
    sum_cov = 0
    sum_length = 0

    with open(stats_in, "r") as f:
        for line in f:
            # Skip header/comment lines and blank lines (e.g. a trailing
            # empty line), which would otherwise fail to parse.
            if line.startswith("#") or not line.strip():
                continue
            tokens = line.split("\t")
            ctg_id, ctg_len, ctg_cov = (tokens[0], int(tokens[1]),
                                        int(tokens[2]))
            ctg_stats[ctg_id] = (ctg_len, ctg_cov)
            sum_cov += ctg_cov * ctg_len
            sum_length += ctg_len

    # An empty stats file (or only zero-length contigs) would otherwise
    # raise ZeroDivisionError; treat it as zero mean coverage.
    if sum_length > 0:
        mean_coverage = int(float(sum_cov) / sum_length)
    else:
        mean_coverage = 0

    if args.read_type == "subasm":
        coverage_threshold = SUBASM_MIN_COVERAGE
    elif args.meta:
        coverage_threshold = HARD_MIN_COVERAGE
    else:
        coverage_threshold = int(
            round(float(mean_coverage) / RELATIVE_MIN_COVERAGE))
        coverage_threshold = max(HARD_MIN_COVERAGE, coverage_threshold)
    logger.debug("Mean contig coverage: {0}, selected threshold: {1}".format(
        mean_coverage, coverage_threshold))

    filtered_num = 0
    filtered_seq = 0
    good_fasta = {}
    for hdr, seq in fp.stream_sequence(contigs_in):
        ctg_len, ctg_cov = ctg_stats[hdr]
        if ctg_cov >= coverage_threshold:
            good_fasta[hdr] = seq
        else:
            filtered_num += 1
            filtered_seq += ctg_len
    logger.debug("Filtered {0} contigs of total length {1}".format(
        filtered_num, filtered_seq))

    fp.write_fasta_dict(good_fasta, contigs_out)
    with open(stats_out, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in good_fasta:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, ctg_stats[ctg_id][0],
                                             ctg_stats[ctg_id][1]))
def split_into_chunks(fasta_in, chunk_size, fasta_out):
    """
    Splits every sequence from fasta_in into consecutive pieces and writes
    them to fasta_out.

    Each sequence yields max(len(seq) // chunk_size, 1) pieces named
    "<header>$chunk_<index>". A piece normally spans chunk_size characters;
    when fewer than chunk_size characters would remain after it, the piece
    is extended to the end of the sequence instead.
    """
    chunks = {}
    for header, seq in fp.stream_sequence(fasta_in):
        seq_len = len(seq)
        num_chunks = max(seq_len // chunk_size, 1)

        for idx in range(num_chunks):
            start = idx * chunk_size
            end = (idx + 1) * chunk_size
            # Absorb a short tail into this piece rather than emitting a
            # separate piece smaller than chunk_size.
            if seq_len - end < chunk_size:
                end = seq_len

            chunk_name = "{0}$chunk_{1}".format(header, idx)
            chunks[chunk_name] = seq[start:end]

    fp.write_fasta_dict(chunks, fasta_out)
def assemble_short_plasmids(args, work_dir, contigs_path):
    """
    Looks for short circular sequences among the reads that do not map to
    the given contigs.

    The steps, in order:
      1. align args.reads against contigs_path and extract reads whose
         mapping rate is below 0.5 into a FASTA file;
      2. align the unmapped reads against themselves and extract circular
         reads and circular pairs from that mapping;
      3. trim those reads/pairs, align the trimmed sequences against
         themselves and extract unique plasmids;
      4. polish the plasmids with the unmapped reads and read the
         coverage from the polishing stats file (if it exists);
      5. delete the intermediate mapping / read files from work_dir.

    Returns a dict mapping sequence id to a (sequence, coverage) tuple for
    every stats entry whose coverage is greater than zero.
    """
    def _align(reference, reads, out_path, as_reference):
        # All alignments here share threads/platform/work_dir settings and
        # are requested with sam_output disabled.
        make_alignment(reference, reads, args.threads, work_dir,
                       args.platform, out_path,
                       reference_mode=as_reference, sam_output=False)

    logger.debug("Extracting unmapped reads")
    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    _align(contigs_path, args.reads, reads2contigs_mapping, True)

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                    unmapped_reads_path,
                                    mapping_rate_threshold=0.5)

    logger.debug("Finding self-mappings for unmapped reads")
    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")
    _align(unmapped_reads_path, [unmapped_reads_path],
           unmapped_reads_mapping, False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted %d circular reads", len(circular_reads))

    logger.debug("Extracing circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted %d circular pairs", len(circular_pairs))

    # Only the reads referenced by circular reads/pairs are loaded into
    # memory (the entire unmapped reads file could be pretty big).
    interesting_reads = {}
    for read_id in circular_reads:
        interesting_reads[read_id] = None
    for pair in circular_pairs:
        first_hit = pair[0]
        interesting_reads[first_hit.query] = None
        interesting_reads[first_hit.target] = None
    for hdr, seq in fp.stream_sequence(unmapped_reads_path):
        if hdr in interesting_reads:
            interesting_reads[hdr] = seq

    trimmed_circular_reads = circular.trim_circular_reads(circular_reads,
                                                          interesting_reads)
    trimmed_circular_pairs = circular.trim_circular_pairs(circular_pairs,
                                                          interesting_reads)

    # Merge both sets of trimmed sequences; entries from pairs override
    # entries from single reads with the same id.
    trimmed_sequences = {}
    trimmed_sequences.update(trimmed_circular_reads.items())
    trimmed_sequences.update(trimmed_circular_pairs.items())
    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")
    fp.write_fasta_dict(trimmed_sequences, trimmed_sequences_path)

    logger.debug("Clustering circular sequences")
    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")
    _align(trimmed_sequences_path, [trimmed_sequences_path],
           trimmed_sequences_mapping, False)

    plasmids = circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                                trimmed_sequences_path)

    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    _polished_fasta, polished_stats = pol.polish(plasmids_raw,
                                                 [unmapped_reads_path],
                                                 work_dir, 1, args.threads,
                                                 args.platform,
                                                 output_progress=False)

    # Extract coverage: first column is the sequence id, third column is
    # parsed as the integer coverage.
    plasmids_with_coverage = {}
    if os.path.isfile(polished_stats):
        with open(polished_stats, "r") as stats_file:
            for line in stats_file:
                if line.startswith("#"):
                    continue
                fields = line.strip().split()
                seq_id = fields[0]
                coverage = int(fields[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = (plasmids[seq_id],
                                                      coverage)

    logger.info("Added %d extra contigs", len(plasmids_with_coverage))

    # Remove all intermediate files that are no longer needed.
    for temp_path in (reads2contigs_mapping,
                      unmapped_reads_path,
                      unmapped_reads_mapping,
                      trimmed_sequences_path,
                      trimmed_sequences_mapping):
        os.remove(temp_path)

    return plasmids_with_coverage