def _map_reads(self, fwd_reads, rev_reads, out_prefix, required_flag=None, exclude_flag=None, sort_reads=False, mate_ref=None, no_map_contigs=None):
    '''Map a read pair against the current contigs, writing <out_prefix>.bam.

    Contigs named in no_map_contigs are left out of the reference. When
    self.clean is set, the temporary reference FASTA and its .fai index are
    removed after mapping.
    '''
    if no_map_contigs is None:
        no_map_contigs = set()

    if self.verbose:
        print(' map reads', fwd_reads, rev_reads, sep='\t')

    reference = out_prefix + '.ref.fa'
    self.write_contigs_to_file(reference, do_not_write=no_map_contigs)

    mapping.map_reads(
        fwd_reads,
        rev_reads,
        reference,
        out_prefix,
        index_k=self.map_index_k,
        index_s=self.map_index_s,
        threads=self.threads,
        max_insert=self.max_insert,
        minid=self.map_minid,
        verbose=self.verbose,
        required_flag=required_flag,
        sort=sort_reads,
        exclude_flag=exclude_flag,
    )

    if not self.clean:
        return
    os.unlink(reference)
    os.unlink(reference + '.fai')
def test_map_reads_wth_flag(self):
    '''Test map_reads with required flag'''
    # NOTE(review): "wth" in the method name looks like a typo for "with";
    # left unchanged because the name is the test's public identifier.
    reference = os.path.join(data_dir, 'mapping_test.ref.trimmed.fa')
    reads_prefix = os.path.join(data_dir, 'mapping_test.reads')
    out_prefix = 'tmp.out'
    out_bam = out_prefix + '.bam'
    mapping.map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', reference, out_prefix, required_flag=12, verbose=3)
    # Compare against a pre-made BAM of reads mapped with flag 12 required.
    wanted = get_sam_columns(os.path.join(data_dir, 'mapping_test.smalt.out.flag12.bam'))
    self.assertListEqual(wanted, get_sam_columns(out_bam))
    os.unlink(out_bam)
def test_map_reads(self):
    '''Test mapping reads'''
    reference = os.path.join(data_dir, 'mapping_test.ref.trimmed.fa')
    reads_prefix = os.path.join(data_dir, 'mapping_test.reads')
    out_prefix = 'tmp.out'
    out_bam = out_prefix + '.bam'
    mapping.map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', reference, out_prefix)
    # Output must match the pre-made expected BAM, column for column.
    wanted = get_sam_columns(os.path.join(data_dir, 'mapping_test.smalt.out.bam'))
    self.assertListEqual(wanted, get_sam_columns(out_bam))
    os.unlink(out_bam)
def test_map_reads_and_sort(self):
    '''Test mapping reads and sort BAM'''
    reference = os.path.join(data_dir, 'mapping_test.ref.trimmed.fa')
    reads_prefix = os.path.join(data_dir, 'mapping_test.reads')
    out_prefix = 'tmp.out'
    mapping.map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', reference, out_prefix, sort=True, verbose=3)
    # Sorted output must match the pre-made coordinate-sorted BAM.
    wanted = get_sam_columns(os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam'))
    self.assertListEqual(wanted, get_sam_columns(out_prefix + '.bam'))
    # sort=True leaves three files behind: sorted BAM, its index, and the
    # unsorted intermediate.
    for suffix in ('.bam', '.bam.bai', '.unsorted.bam'):
        os.unlink(out_prefix + suffix)
def _kmc_to_kmer_counts(infile, number, kmers_to_ignore=None, contigs_to_check=None, verbose=0, threads=1):
    '''Makes a dict of the most common kmers from the kmer counts output file of kmc.

    Returns a dict of kmer (str) -> count (int) with at most `number` entries.
    Kmers already in kmers_to_ignore are skipped. Raises Error when a line of
    the counts file, or a mapped sequence name, cannot be parsed.
    '''
    # Fix: kmers_to_ignore defaults to None, but the mapping branch below does
    # `nucleotides not in kmers_to_ignore`, which raised TypeError when only
    # contigs_to_check was supplied. Normalise to an empty set up front.
    if kmers_to_ignore is None:
        kmers_to_ignore = set()

    counts = {}
    if os.path.getsize(infile) == 0:
        return counts

    tmpdir = tempfile.mkdtemp(prefix='tmp.common_kmers.', dir=os.getcwd())
    ref_seqs_file = os.path.join(tmpdir, 'ref.fa')
    counts_fasta_file = os.path.join(tmpdir, 'counts.fa')
    using_refs = _write_ref_seqs_to_be_checked(ref_seqs_file, kmers_to_ignore=kmers_to_ignore, contigs_to_check=contigs_to_check)

    if not using_refs:
        if verbose > 2:
            print('No existing kmers or contigs to check against. Using most common kmer for seed', flush=True)
        f = pyfastaq.utils.open_file_read(infile)
        for line in f:
            if len(counts) >= number:
                break
            try:
                # kmc count lines are "<kmer><whitespace><count>"
                kmer, count = line.rstrip().split()
                count = int(count)
            except ValueError:
                # Narrowed from a bare except: split()-unpack and int() both
                # raise ValueError on a malformed line.
                raise Error('Error getting kmer info from this line:\n' + line)
            counts[kmer] = count
        pyfastaq.utils.close(f)
    else:
        if verbose > 2:
            print('Existing kmers or contigs to check against. Running mapping', flush=True)
        mapping_prefix = os.path.join(tmpdir, 'map')
        bam = mapping_prefix + '.bam'
        _counts_file_to_fasta(infile, counts_fasta_file)
        # required_flag='0x4' keeps only UNMAPPED kmers, i.e. those that do not
        # match any existing kmer/contig sequence.
        mapping.map_reads(counts_fasta_file, None, ref_seqs_file, mapping_prefix, minid=0.9, index_k=9, index_s=1, sort=False, verbose=verbose, required_flag='0x4', threads=threads)

        sam_reader = pysam.Samfile(bam, "rb")
        for sam in sam_reader.fetch(until_eof=True):
            if len(counts) >= number:
                break
            try:
                # Sequence names encode the count as "<id>_<count>". Fix: cast
                # to int so both branches store the same value type (the
                # original left this branch's counts as strings).
                count = int(sam.qname.split('_')[1])
            except (IndexError, ValueError):
                raise Error('Error getting count from sequence name in bam:\n' + sam.qname)
            nucleotides = common.decode(sam.seq)
            if nucleotides not in kmers_to_ignore:
                counts[nucleotides] = count
            elif verbose >= 4:
                print('Skipping seed already found:', nucleotides)
        sam_reader.close()

    shutil.rmtree(tmpdir)
    return counts
def _trim_ends(fasta_in, fasta_out, to_trim, min_length=100, min_dist_to_end=25, window_length=10, min_pc=90):
    '''Trim sequences off contig ends.'''
    # Map the trim sequences onto the contigs, then keep only the span of each
    # contig that _coverage_to_trimmed_coords says is good.
    workdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    prefix = os.path.join(workdir, 'out')
    sorted_bam = prefix + '.bam'
    mapping.map_reads(to_trim, None, fasta_in, prefix, index_k=9, index_s=1, threads=1, minid=0.75, sort=True, extra_smalt_map_ops='-d -1 -m 10')

    out_fh = pyfastaq.utils.open_file_write(fasta_out)
    for seq in pyfastaq.sequences.file_reader(fasta_in):
        coverage = mapping.get_bam_region_coverage(sorted_bam, seq.id, len(seq), both_strands=True)
        keep = _coverage_to_trimmed_coords(coverage, min_dist_to_end=min_dist_to_end, window_length=window_length, min_pc=min_pc)
        if keep is None:
            # Nothing of this contig survives trimming.
            continue
        start, end = keep
        seq.seq = seq.seq[start:end + 1]
        if len(seq) >= min_length:
            print(seq, file=out_fh)
    pyfastaq.utils.close(out_fh)
    shutil.rmtree(workdir)
def process(self):
    '''Map the reads against the seeds, extend each seed in parallel, and
    write all successfully extended seeds to self.outfile.

    Raises Error when no seed could be extended. Removes the temporary
    working directory on success.
    '''
    self.tmpdir = tempfile.mkdtemp(prefix='tmp.process_seeds.', dir=os.getcwd())
    tmp_prefix = os.path.join(self.tmpdir, 'out')
    mapping.map_reads(self.reads1, self.reads2, self.seeds_fasta, tmp_prefix, index_k=self.index_k, index_s=self.index_s, threads=self.threads, max_insert=self.max_insert, minid=self.minid, sort=True)
    self.bam_file = tmp_prefix + '.bam'
    # NOTE(review): the original had `threads = min(8, self.threads)` ("to save
    # peak memory going too high") immediately overwritten by the line below,
    # so the cap never took effect. The dead assignment is removed here —
    # behaviour is unchanged. Reinstate the cap if the memory concern is real.
    threads = self.threads
    if self.verbose:
        print('Processing seeds with', threads, 'threads:', list(self.original_seeds.keys()))
    # Each worker writes its extended seed to tmp_prefix.<seed_name>.fa.
    pool = multiprocessing.Pool(threads)
    pool.map(self._make_new_seed, list(self.original_seeds.keys()))
    pool.close()
    pool.join()
    if self.verbose:
        print('... finished processing seeds')

    # Gather the per-seed output files; a missing file means that seed failed.
    new_seeds = {}
    for seed_name in self.original_seeds:
        fname = tmp_prefix + '.' + seed_name + '.fa'
        if os.path.exists(fname):
            pyfastaq.tasks.file_to_dict(fname, new_seeds)

    if len(new_seeds) == 0:
        raise Error('Error! did not make any new seeds. Cannot continue')

    f = pyfastaq.utils.open_file_write(self.outfile)
    for seq in new_seeds.values():
        print(seq, file=f)
    pyfastaq.utils.close(f)
    shutil.rmtree(self.tmpdir)
def _trim_ends(fasta_in, fasta_out, to_trim, min_length=100, min_dist_to_end=25, window_length=10, min_pc=90):
    '''Trim sequences off contig ends.'''
    tmpdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    bam_prefix = os.path.join(tmpdir, 'out')
    # Map the sequences to be trimmed against the contigs; coverage from this
    # BAM decides which portion of each contig is kept.
    mapping.map_reads(to_trim, None, fasta_in, bam_prefix, index_k=9, index_s=1, threads=1, minid=0.75, sort=True, extra_smalt_map_ops='-d -1 -m 10')
    sorted_bam = bam_prefix + '.bam'

    writer = pyfastaq.utils.open_file_write(fasta_out)
    for seq in pyfastaq.sequences.file_reader(fasta_in):
        coverage = mapping.get_bam_region_coverage(sorted_bam, seq.id, len(seq), both_strands=True)
        coords = _coverage_to_trimmed_coords(coverage, min_dist_to_end=min_dist_to_end, window_length=window_length, min_pc=min_pc)
        if coords is not None:
            seq.seq = seq.seq[coords[0]:coords[1] + 1]
            if len(seq) >= min_length:
                print(seq, file=writer)
    pyfastaq.utils.close(writer)
    shutil.rmtree(tmpdir)
def process(self):
    '''Map the reads against the seed sequences, extend every seed in a
    process pool, and write the extended seeds to self.outfile.

    Raises Error when no new seed was produced. The temporary directory is
    removed on success.
    '''
    self.tmpdir = tempfile.mkdtemp(prefix='tmp.process_seeds.', dir=os.getcwd())
    tmp_prefix = os.path.join(self.tmpdir, 'out')
    mapping.map_reads(self.reads1, self.reads2, self.seeds_fasta, tmp_prefix, index_k=self.index_k, index_s=self.index_s, threads=self.threads, max_insert=self.max_insert, minid=self.minid, sort=True)
    self.bam_file = tmp_prefix + '.bam'
    # NOTE(review): a dead `threads = min(8, self.threads)` cap ("to save peak
    # memory going too high") was immediately overwritten by the assignment
    # below and never applied; it has been removed without changing behaviour.
    # Restore the cap deliberately if high thread counts do blow up memory.
    threads = self.threads
    if self.verbose:
        print('Processing seeds with', threads, 'threads:', list(self.original_seeds.keys()))
    # One pool task per seed; each writes its result to tmp_prefix.<seed>.fa.
    pool = multiprocessing.Pool(threads)
    pool.map(self._make_new_seed, list(self.original_seeds.keys()))
    pool.close()
    pool.join()
    if self.verbose:
        print('... finished processing seeds')

    # Collect whatever seed files the workers produced.
    new_seeds = {}
    for seed_name in self.original_seeds:
        fname = tmp_prefix + '.' + seed_name + '.fa'
        if os.path.exists(fname):
            pyfastaq.tasks.file_to_dict(fname, new_seeds)

    if len(new_seeds) == 0:
        raise Error('Error! did not make any new seeds. Cannot continue')

    f = pyfastaq.utils.open_file_write(self.outfile)
    for seq in new_seeds.values():
        print(seq, file=f)
    pyfastaq.utils.close(f)
    shutil.rmtree(self.tmpdir)