def _total_alignment_score(self, seq_name):
    '''Returns the result of mapping.get_total_alignment_score() for
    self.reads1 and self.reads2 mapped with bowtie2 against seq_name only.

    seq_name is extracted from self.references_fa into a scratch FASTA file
    inside a temporary directory created in the current working directory.
    That directory is always removed before returning, including when the
    extraction or the mapping raises; the exception is propagated.'''
    tmpdir = tempfile.mkdtemp(prefix='tmp.get_total_aln_score.', dir=os.getcwd())
    try:
        tmp_bam = os.path.join(tmpdir, 'tmp.get_total_alignment_score.bam')
        tmp_fa = os.path.join(tmpdir, 'tmp.get_total_alignment_score.ref.fa')

        faidx.write_fa_subset(
            [seq_name],
            self.references_fa,
            tmp_fa,
            samtools_exe=self.samtools_exe,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            tmp_fa,
            # '.bam' is stripped here; presumably run_bowtie2 takes an output
            # prefix and adds the extension itself (tmp_bam is read back
            # below) - confirm against mapping.run_bowtie2
            tmp_bam[:-4],
            threads=self.threads,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        return mapping.get_total_alignment_score(tmp_bam)
    finally:
        # Runs on success and on failure, so a crashed extraction/mapping
        # does not leave tmp.get_total_aln_score.* directories behind
        shutil.rmtree(tmpdir)
def _get_total_alignment_score(self, gene_name):
    '''Returns the result of mapping.get_total_alignment_score() for
    self.reads1 and self.reads2 mapped with bowtie2 against gene_name only.

    gene_name is extracted from self.genes_fa into a scratch FASTA file in
    self.root_dir, and the BAM file is made alongside it. Raises
    AssertionError if either scratch file already exists (nothing is deleted
    in that case). Otherwise the scratch BAM, FASTA and FASTA '.fai' file
    are removed before returning, whether or not the mapping succeeded, so
    that a failed call cannot make the next call trip those assertions.'''
    tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam')
    assert not os.path.exists(tmp_bam)
    tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa')
    assert not os.path.exists(tmp_fa)

    try:
        faidx.write_fa_subset(
            [gene_name],
            self.genes_fa,
            tmp_fa,
            samtools_exe=self.samtools_exe,
            verbose=self.verbose
        )

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            tmp_fa,
            # '.bam' is stripped here; presumably run_bowtie2 takes an output
            # prefix and adds the extension itself (tmp_bam is read back
            # below) - confirm against mapping.run_bowtie2
            tmp_bam[:-4],
            threads=self.threads,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )

        return mapping.get_total_alignment_score(tmp_bam)
    finally:
        # On failure some of these may never have been created, so only
        # remove the ones that are actually there
        for filename in (tmp_bam, tmp_fa, tmp_fa + '.fai'):
            if os.path.exists(filename):
                os.unlink(filename)
def test_write_fa_subset(self):
    '''test write_fa_subset'''
    infile = os.path.join(data_dir, 'faidx_test_write_fa_subset.in.fa')
    expected = os.path.join(data_dir, 'faidx_test_write_fa_subset.out.fa')
    tmpfile = 'tmp.test_write_fa_subset.out.fa'
    try:
        # Pull three named sequences out of the input and compare the
        # result byte-for-byte (shallow=False) with the expected FASTA
        faidx.write_fa_subset(['seq1', 'seq3', 'seq4'], infile, tmpfile)
        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
    finally:
        # Clean up even when the call or the assertion fails. The file may
        # not exist if write_fa_subset raised before creating it.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def _choose_best_gene(self):
    '''Returns the sequence of the gene named by
    self._get_best_gene_by_alignment_score(), or None if that returns None.

    The chosen gene is written from self.genes_fa to the file self.gene_fa,
    and the returned object is the single sequence loaded back from that
    file.'''
    best_name = self._get_best_gene_by_alignment_score()

    if best_name is not None:
        faidx.write_fa_subset(
            [best_name],
            self.genes_fa,
            self.gene_fa,
            samtools_exe=self.samtools_exe,
            verbose=self.verbose,
        )

        loaded = {}
        pyfastaq.tasks.file_to_dict(self.gene_fa, loaded)
        # Exactly one name was requested, so exactly one sequence is expected
        assert len(loaded) == 1
        only_sequences = list(loaded.values())
        return only_sequences[0]

    return None
def best_seq(self, outfile):
    '''Finds the closest matching sequence, writes it to the FASTA file
    outfile, and returns it as a pyfastaq.sequences.Fasta object.
    Returns None, without writing outfile, if
    self._get_best_seq_by_alignment_score() returns None.'''
    closest_name = self._get_best_seq_by_alignment_score()
    if closest_name is None:
        return None

    faidx.write_fa_subset(
        [closest_name],
        self.references_fa,
        outfile,
        samtools_exe=self.samtools_exe,
        verbose=True,
        verbose_filehandle=self.log_fh,
    )

    # Load the file that was just written; it must hold exactly the one
    # sequence that was asked for
    name_to_seq = {}
    pyfastaq.tasks.file_to_dict(outfile, name_to_seq)
    assert len(name_to_seq) == 1
    names = list(name_to_seq)
    return name_to_seq[names[0]]
def _init_and_run_clusters(self):
    '''Builds and runs one cluster.Cluster per entry of self.cluster_to_dir.

    Raises Error if self.cluster_to_dir is empty. Otherwise the genes are
    processed in sorted order. For each one, the sequences named in
    self.cluster_ids[gene] are written from self.db_fasta to genes.fa in
    that gene's directory, a Cluster is made and stored in
    self.clusters[gene], and its run() method is called.'''
    total_clusters = len(self.cluster_to_dir)
    if total_clusters == 0:
        raise Error('Did not get any reads mapped to genes. Cannot continue')

    for cluster_number, gene in enumerate(sorted(self.cluster_to_dir), start=1):
        if self.verbose:
            print('\nAssembling cluster', cluster_number, 'of', str(total_clusters))

        cluster_dir = self.cluster_to_dir[gene]

        faidx.write_fa_subset(
            self.cluster_ids[gene],
            self.db_fasta,
            os.path.join(cluster_dir, 'genes.fa'),
            samtools_exe=self.samtools_exe,
            verbose=self.verbose
        )

        # NOTE(review): sspace_k is fed from self.min_scaff_depth and
        # velvet_exe from self.velvet - kept exactly as found, confirm that
        # both are intended
        new_cluster = cluster.Cluster(
            cluster_dir,
            gene,
            assembly_kmer=self.assembly_kmer,
            assembler=self.assembler,
            max_insert=self.insert_proper_pair_max,
            min_scaff_depth=self.min_scaff_depth,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            sspace_k=self.min_scaff_depth,
            reads_insert=self.insert_size,
            sspace_sd=self.insert_sspace_sd,
            threads=self.threads,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            verbose=self.verbose,
            bcftools_exe=self.bcftools_exe,
            gapfiller_exe=self.gapfiller_exe,
            samtools_exe=self.samtools_exe,
            bowtie2_exe=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            spades_exe=self.spades_exe,
            sspace_exe=self.sspace_exe,
            velvet_exe=self.velvet,
            spades_other=self.spades_other,
            clean=self.clean,
        )
        # Register the cluster before running it, as the original did
        self.clusters[gene] = new_cluster
        new_cluster.run()
def run(self):
    '''Assembles the reads, then scaffolds, gap fills, length filters,
    orients against the closest reference and maps the reads back.

    Sets self.sequences and self.assembled_ok. Stops early, with
    self.assembled_ok False, if the assembler made no contigs or if no
    scaffold passes the self.min_scaff_length filter. Also stops early if
    no closest reference sequence is found; self.assembled_ok is left True
    in that case. On the full path it additionally sets self.ref_seq_name,
    self.has_contigs_on_both_strands and self.scaff_graph_ok, and deletes
    three intermediate files next to the final BAM when self.clean is set.

    Every normal return leaves self.log_fh set to None, which makes this
    object picklable, to keep multithreading happy.'''
    self._assemble_with_fermilite()
    self.sequences = {}

    # A missing contigs file counts the same as an empty one
    if os.path.exists(self.assembly_contigs):
        contig_count = pyfastaq.tasks.count_sequences(self.assembly_contigs)
    else:
        contig_count = 0

    if contig_count == 0:
        self.assembled_ok = False
        self.log_fh = None
        return

    self.assembled_ok = True
    self._scaffold_with_sspace()
    self._gap_fill_with_gapfiller()

    pyfastaq.tasks.filter(
        self.gapfilled_scaffolds,
        self.gapfilled_length_filtered,
        minlength=self.min_scaff_length
    )
    if pyfastaq.tasks.count_sequences(self.gapfilled_length_filtered) == 0:
        self.assembled_ok = False
        self.log_fh = None
        return

    # Use mash to choose which reference sequence to orient against
    self.ref_seq_name = mash.Masher(
        self.ref_fastas,
        self.gapfilled_length_filtered,
        self.log_fh,
        self.extern_progs
    ).run(self.mash_dist_file)

    if self.ref_seq_name is None:
        print('Could not determine closest reference sequence', file=self.log_fh)
        self.log_fh = None
        return

    faidx.write_fa_subset(
        {self.ref_seq_name},
        self.ref_fastas,
        self.ref_fasta,
        samtools_exe=self.extern_progs.exe('samtools'),
        verbose=True,
        verbose_filehandle=self.log_fh
    )
    print('Closest reference sequence according to mash: ', self.ref_seq_name, file=self.log_fh)

    both_strand_contigs = self._fix_contig_orientation(
        self.gapfilled_length_filtered,
        self.ref_fasta,
        self.final_assembly_fa,
        min_id=self.nucmer_min_id,
        min_length=self.nucmer_min_len,
        breaklen=self.nucmer_breaklen
    )
    self.has_contigs_on_both_strands = len(both_strand_contigs) > 0
    pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

    mapping.run_bowtie2(
        self.reads1,
        self.reads2,
        self.final_assembly_fa,
        self.final_assembly_bam[:-4],
        threads=1,
        sort=True,
        samtools=self.extern_progs.exe('samtools'),
        bowtie2=self.extern_progs.exe('bowtie2'),
        verbose=True,
        verbose_filehandle=self.log_fh
    )

    self.scaff_graph_ok = self._parse_bam(
        self.sequences,
        self.final_assembly_bam,
        self.min_scaff_depth,
        self.max_insert
    )
    print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

    if self.clean:
        for suffix in ('soft_clipped', 'unmapped_mates', 'scaff'):
            doomed = self.final_assembly_bam + '.' + suffix
            print('Deleting file', doomed, file=self.log_fh)
            os.unlink(doomed)

    self.log_fh = None