def hmmscan_worker(self, part_file, cmd_line, shitty_output_file, log_file, merged_file_buffer, buffer_write_lock):
    """Run one hmmscan command and append its cleaned-up hits to the shared buffer.

    The command is run through `utils.run_command` with `log_file`. Each
    non-comment line of `shitty_output_file` is decoded as ASCII (dropping
    undecodable bytes), split on whitespace, truncated to its first 18 fields
    and written TAB-joined to `merged_file_buffer` while holding
    `buffer_write_lock`.

    Raises ConfigError if the expected output file does not exist after the run.
    """
    utils.run_command(cmd_line, log_file)

    output_is_missing = not os.path.exists(shitty_output_file)
    if output_is_missing:
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan and it failed to generate the expected output :/ Fortunately "
                          "we have this log file which should clarify the problem: '%s'. Please do not forget to include this "
                          "file in your question if you were to seek help from the community." % log_file)

    # 1-based numbers of the lines that lost bytes during ASCII decoding
    non_ascii_line_numbers = []

    with open(shitty_output_file, 'rb') as raw_hits:
        for line_number, raw_line in enumerate(raw_hits, start=1):
            text = raw_line.decode('ascii', 'ignore')

            # a length mismatch means at least one byte was dropped by 'ignore'
            if len(text) != len(raw_line):
                non_ascii_line_numbers.append(line_number)

            if text.startswith('#'):
                continue

            fields = text.split()[:18]
            with buffer_write_lock:
                merged_file_buffer.write('\t'.join(fields) + '\n')

    if non_ascii_line_numbers:
        self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing "
                         "the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s. "
                         "You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"." %
                         (shitty_output_file, ", ".join(str(n) for n in non_ascii_line_numbers)))
def makedb(self, output_db_path=None, dbtype='prot'):
    """Build a BLAST search database from `self.target_fasta` with `makeblastdb`.

    Parameters
    ==========
    output_db_path : str, optional
        Where to write the database. Falls back to `self.target_fasta`.
    dbtype : str
        Either 'prot' or 'nucl'.

    Raises ConfigError for an unknown `dbtype`. The presence of the expected
    header file ('.phr' for 'prot', '.nhr' for 'nucl') is verified through
    `self.check_output`.
    """
    if dbtype not in ('prot', 'nucl'):
        raise ConfigError("The `makedb` function in `BLAST` does not know about dbtype '%s' :(" % dbtype)

    # the single place that decides where the database actually lives
    db_path = output_db_path or self.target_fasta

    self.progress.new('BLAST')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    cmd_line = ['makeblastdb',
                '-in', self.target_fasta,
                '-dbtype', dbtype,
                '-out', db_path]

    utils.run_command(cmd_line, self.run.log_file_path)

    self.progress.end()

    header_extension = '.phr' if dbtype == 'prot' else '.nhr'
    self.check_output(db_path + header_extension, 'makeblastdb')

    self.run.info('blast makeblast cmd', cmd_line, quiet=True)
    # report the path the database was really written to, not always the FASTA
    self.run.info('BLAST search db', db_path)
def run_prodigal(self, fasta_file_path):
    """Run prodigal on `fasta_file_path` inside a new temporary directory.

    The temporary directory is recorded in `self.tmp_dirs`. The gene and
    protein output paths are stored in `self.genes_in_contigs` and
    `self.proteins_in_contigs`.

    Returns the path of the protein output file. Raises ConfigError if
    that file does not exist once the command has finished.
    """
    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.genes_in_contigs = os.path.join(tmp_dir, 'contigs.genes')
    self.proteins_in_contigs = os.path.join(tmp_dir, 'contigs.proteins')

    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.warning('', header='Finding ORFs in contigs', lc='green')
    self.run.info('Genes', self.genes_in_contigs)
    self.run.info('Proteins', self.proteins_in_contigs)
    self.run.info('Log file', log_file_path)

    self.progress.new('Processing')
    self.progress.update('Identifying ORFs in contigs ...')
    cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' % (fasta_file_path,
                                                                             self.genes_in_contigs,
                                                                             self.proteins_in_contigs,
                                                                             log_file_path))

    # keep a copy of the exact command at the top of the log
    with open(log_file_path, "a") as log_file:
        log_file.write('CMD: ' + cmd_line + '\n')

    utils.run_command(cmd_line)

    if not os.path.exists(self.proteins_in_contigs):
        # close the progress display before the error is reported
        self.progress.end()
        raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    return self.proteins_in_contigs
def hmmer_worker(self, partial_input_file, cmd_line, table_output_file, standard_output_file, desired_output, log_file, merged_files_dict):
    """Run one HMMER command and append its outputs to the merged files.

    For every entry of `desired_output`, the matching buffer and lock are read
    from `merged_files_dict[entry]` and the worker file is handed to
    `self.append_to_main_table_file` ('table') or
    `self.append_to_main_standard_file` ('standard').

    Raises ConfigError if either the table or the standard output file is
    missing after the command has run.
    """
    # step 1: run the search
    utils.run_command(cmd_line, log_file)

    both_outputs_exist = os.path.exists(table_output_file) and os.path.exists(standard_output_file)
    if not both_outputs_exist:
        self.progress.end()
        raise ConfigError("Something went wrong with %s and it failed to generate the expected output :/ Fortunately "
                          "we have this log file which should clarify the problem: '%s'. Please do not forget to include this "
                          "file in your question if you were to seek help from the community." % (self.program_to_use, log_file))

    # step 2: push this worker's results into the shared file(s)
    for output in desired_output:
        destination = merged_files_dict[output]
        main_file_buffer = destination['buffer']
        main_file_lock = destination['lock']

        if output == 'table':
            worker_file, append_function = table_output_file, self.append_to_main_table_file
        elif output == 'standard':
            worker_file, append_function = standard_output_file, self.append_to_main_standard_file

        append_function(main_file_buffer, worker_file, main_file_lock)
def blastp(self):
    """Run `diamond blastp` and verify that the `.daa` output was created.

    Optional flags (`--sensitive`, `--max-target-seqs`, `--evalue`) are added
    only when the matching attribute is truthy.
    """
    mode = 'Sensitive' if self.sensitive else 'Fast'
    self.run.info('DIAMOND is set to be', mode)

    cmd_line = ['diamond', 'blastp',
                '-q', self.query_fasta,
                '-d', self.target_fasta,
                '-a', self.search_output_path,
                '-t', self.tmp_dir,
                '-p', self.num_threads]

    if self.sensitive:
        cmd_line.append('--sensitive')

    if self.max_target_seqs:
        cmd_line += ['--max-target-seqs', self.max_target_seqs]

    if self.evalue:
        cmd_line += ['--evalue', self.evalue]

    printable_cmd = ' '.join(map(str, cmd_line))
    self.run.info('DIAMOND blastp cmd', printable_cmd, quiet=(not anvio.DEBUG))

    self.progress.new('DIAMOND')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)
    utils.run_command(cmd_line, self.run.log_file_path)
    self.progress.end()

    # diamond appends the '.daa' extension to the path given with -a
    expected_output = self.search_output_path + '.daa'
    self.check_output(expected_output, 'blastp')

    self.run.info('Diamond blastp results', expected_output)
def blastp(self):
    """Run NCBI `blastp` with tabular output (outfmt 6) and check the result file.

    If `self.names_dict` is truthy, `self.ununique_search_results()` is called
    after the output check.
    """
    self.progress.new('BLASTP')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    cmd_line = ['blastp']
    cmd_line.extend(['-query', self.query_fasta])
    cmd_line.extend(['-db', self.target_db_path])
    cmd_line.extend(['-evalue', self.evalue])
    cmd_line.extend(['-outfmt', '6'])
    cmd_line.extend(['-out', self.search_output_path])
    cmd_line.extend(['-num_threads', self.num_threads])

    # only limit the number of targets when a limit is configured
    if self.max_target_seqs:
        cmd_line.extend(['-max_target_seqs', self.max_target_seqs])

    self.run.info('blast blastp cmd', cmd_line, quiet=True)

    utils.run_command(cmd_line, self.run.log_file_path)
    self.progress.end()

    self.check_output(self.search_output_path, 'blastp')

    if self.names_dict:
        self.ununique_search_results()

    self.run.info('BLASTP results', self.search_output_path)
def blast(self):
    """Run `self.search_program` with tabular output (outfmt 6) and check the result file.

    If `self.names_dict` is truthy, `self.ununique_search_results()` is called
    after the output check.
    """
    program = self.search_program

    cmd_line = [program,
                '-query', self.query_fasta,
                '-db', self.target_fasta,
                '-evalue', self.evalue,
                '-outfmt', '6',
                '-out', self.search_output_path,
                '-num_threads', self.num_threads]

    if self.max_target_seqs:
        cmd_line.extend(['-max_target_seqs', self.max_target_seqs])

    cmd_as_text = ' '.join(map(str, cmd_line))
    self.run.info('NCBI %s cmd' % self.search_program, cmd_as_text, quiet=(not anvio.DEBUG))

    self.progress.new('BLAST')
    status = 'running search (using %s with %d thread(s)) ...' % (self.search_program, self.num_threads)
    self.progress.update(status)

    utils.run_command(cmd_line, self.run.log_file_path)
    self.progress.end()

    self.check_output(self.search_output_path, self.search_program)

    if self.names_dict:
        self.ununique_search_results()

    self.run.info('BLAST results', self.search_output_path)
def hmmer_worker(self, partial_input_file, cmd_line, table_output_file, standard_output_file, desired_output, log_file, output_queue, ret_value_queue):
    """Run one HMMER command and report its output files through queues.

    On success, a dict mapping each requested output kind ('table' and/or
    'standard') to its file path is put on `output_queue`, followed by `0` on
    `ret_value_queue`. Any exception raised along the way is put on
    `ret_value_queue` instead of propagating.
    """
    try:
        # run the search first
        utils.run_command(cmd_line, log_file)

        outputs_present = os.path.exists(table_output_file) and os.path.exists(standard_output_file)
        if not outputs_present:
            self.progress.end()
            raise ConfigError("Something went wrong with %s and it failed to generate the expected output :/ Fortunately "
                              "we have this log file which should clarify the problem: '%s'. Please do not forget to include this "
                              "file in your question if you were to seek help from the community." % (self.program_to_use, log_file))

        # hand the paths over; whoever reads the queue merges them into the main files
        produced_files = {}
        for requested in desired_output:
            if requested == 'table':
                produced_files['table'] = table_output_file
            elif requested == 'standard':
                produced_files['standard'] = standard_output_file
        output_queue.put(produced_files)

        # 0 signals success
        ret_value_queue.put(0)

    except Exception as error:
        # forward the failure through the queue instead of raising here
        ret_value_queue.put(error)
def blast(self):
    """Run the configured NCBI search program and validate its tabular output.

    The command uses outfmt 6 and writes to `self.search_output_path`. When
    `self.names_dict` is truthy the results are expanded with
    `self.ununique_search_results()`.
    """
    cmd_line = [self.search_program]
    cmd_line += ['-query', self.query_fasta]
    cmd_line += ['-db', self.target_fasta]
    cmd_line += ['-evalue', self.evalue]
    cmd_line += ['-outfmt', '6']
    cmd_line += ['-out', self.search_output_path]
    cmd_line += ['-num_threads', self.num_threads]

    # optional cap on the number of reported targets
    if self.max_target_seqs:
        cmd_line += ['-max_target_seqs', self.max_target_seqs]

    info_key = 'NCBI %s cmd' % self.search_program
    self.run.info(info_key, ' '.join(str(part) for part in cmd_line), quiet=(not anvio.DEBUG))

    self.progress.new('BLAST')
    self.progress.update('running search (using %s with %d thread(s)) ...' % (self.search_program, self.num_threads))

    utils.run_command(cmd_line, self.run.log_file_path)

    self.progress.end()

    self.check_output(self.search_output_path, self.search_program)

    if self.names_dict:
        self.ununique_search_results()

    self.run.info('BLAST results', self.search_output_path)
def run_prodigal(self, fasta_file_path):
    """Run prodigal on `fasta_file_path` inside a new temporary directory.

    The temporary directory is appended to `self.tmp_dirs`, and the output
    paths are kept in `self.genes_in_contigs` / `self.proteins_in_contigs`.

    Returns the protein output path. Raises ConfigError when prodigal did not
    produce that file.
    """
    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.genes_in_contigs = os.path.join(tmp_dir, 'contigs.genes')
    self.proteins_in_contigs = os.path.join(tmp_dir, 'contigs.proteins')

    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.warning('', header='Finding ORFs in contigs', lc='green')
    self.run.info('Genes', self.genes_in_contigs)
    self.run.info('Proteins', self.proteins_in_contigs)
    self.run.info('Log file', log_file_path)

    self.progress.new('Processing')
    self.progress.update('Identifying ORFs in contigs ...')
    cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' % (fasta_file_path,
                                                                             self.genes_in_contigs,
                                                                             self.proteins_in_contigs,
                                                                             log_file_path))

    # record the command itself so the log is self-explanatory
    with open(log_file_path, "a") as log_file:
        log_file.write('CMD: ' + cmd_line + '\n')

    utils.run_command(cmd_line)

    if not os.path.exists(self.proteins_in_contigs):
        # end the progress display so the error is not printed over it
        self.progress.end()
        raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    return self.proteins_in_contigs
def blastp(self):
    """Run `diamond blastp` writing tabular results to `self.tabular_output_path`.

    The output format fields come from splitting `self.outfmt` on whitespace.
    `--sensitive`, `--max-target-seqs`, `--id` and `--evalue` are added only
    when the corresponding attribute is truthy.
    """
    self.run.warning(None, header="DIAMOND BLASTP", lc="green")

    mode = "Sensitive" if self.sensitive else "Fast"
    self.run.info("Mode", mode)

    cmd_line = ['diamond', 'blastp',
                '-q', self.query_fasta,
                '-d', self.target_fasta,
                '-o', self.tabular_output_path,
                '-t', self.tmp_dir,
                '-p', self.num_threads,
                '--outfmt']
    # each whitespace-separated token of outfmt is its own argument
    cmd_line.extend(self.outfmt.split())

    if self.sensitive:
        cmd_line.append('--sensitive')

    if self.max_target_seqs:
        cmd_line += ['--max-target-seqs', self.max_target_seqs]

    if self.min_pct_id:
        cmd_line += ['--id', self.min_pct_id]

    if self.evalue:
        cmd_line += ['--evalue', self.evalue]

    self.run.info('DIAMOND blastp cmd', ' '.join(map(str, cmd_line)), quiet=(not anvio.DEBUG))

    self.progress.new('DIAMOND')
    self.progress.update('Running blastp (using %d thread(s)) ...' % self.num_threads)

    utils.run_command(cmd_line, self.run.log_file_path)

    self.progress.end()

    self.run.info('Search results', self.tabular_output_path)
def clusterize(self, parts):
    """Submit one qsub job per part and block until none of them is listed anymore.

    For every entry of `parts` a shell script named `<part>.sh` is written from
    `QSUB_SCRIPT` and submitted with `qsub`. The method then polls
    `self.get_qstat_info(identifier)` every 5 seconds until the total number
    of reported processes is 0.

    Returns True.
    """
    # a random identifier of 10 uppercase letters shared by all jobs of this call
    identifier = ''.join(random.choice(string.ascii_uppercase) for _ in range(10))

    for part in parts:
        command = self.command % {'binary': self.binary, 'part': part}

        # write the job script; the `with` block guarantees it is flushed and
        # closed before qsub reads it
        shell_script = part + '.sh'
        with open(shell_script, 'w') as script_file:
            script_file.write(QSUB_SCRIPT % {'log': part + '.log',
                                             'identifier': identifier,
                                             'command': command})

        # submit script to cluster
        utils.run_command('qsub %s' % shell_script)

    while True:
        qstat_info = self.get_qstat_info(identifier)
        total_processes = sum(qstat_info.values())

        if total_processes == 0:
            break

        self.progress.update('Qstat Info :: Total Jobs: %s, %s' % (pp(total_processes),
                             ', '.join(['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

        time.sleep(5)

    return True
def blastp(self):
    """Run `diamond blastp` and make sure the `.daa` result file exists.

    `--sensitive`, `--max-target-seqs` and `--evalue` are passed only when the
    matching attribute is truthy.
    """
    self.run.info('DIAMOND is set to be', 'Sensitive' if self.sensitive else 'Fast')

    cmd_line = ['diamond', 'blastp']
    cmd_line += ['-q', self.query_fasta]
    cmd_line += ['-d', self.target_fasta]
    cmd_line += ['-a', self.search_output_path]
    cmd_line += ['-t', self.tmp_dir]
    cmd_line += ['-p', self.num_threads]

    if self.sensitive:
        cmd_line.append('--sensitive')

    if self.max_target_seqs:
        cmd_line.extend(('--max-target-seqs', self.max_target_seqs))

    if self.evalue:
        cmd_line.extend(('--evalue', self.evalue))

    cmd_text = ' '.join(str(token) for token in cmd_line)
    self.run.info('DIAMOND blastp cmd', cmd_text, quiet=(not anvio.DEBUG))

    self.progress.new('DIAMOND')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    utils.run_command(cmd_line, self.run.log_file_path)

    self.progress.end()

    # the result is expected at the -a path plus a '.daa' extension
    daa_path = self.search_output_path + '.daa'
    self.check_output(daa_path, 'blastp')

    self.run.info('Diamond blastp results', daa_path)
def view(self):
    """Convert the DIAMOND `.daa` search output into a tabular file with `diamond view`.

    When `self.names_dict` is truthy, the tabular output (which then only
    describes unique entries) is expanded in place with
    `utils.ununique_BLAST_tabular_output`, so downstream analyses do not need
    to care about that detail.
    """
    self.progress.new('DIAMOND')
    self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)

    cmd_line = ('diamond view -a %s -o %s -p %d -k 1000000 >> "%s" 2>&1' % (self.search_output_path + '.daa',
                                                                             self.tabular_output_path,
                                                                             self.num_threads,
                                                                             self.run.log_file_path))

    self.run.info('diamond view cmd', cmd_line, quiet=True)

    utils.run_command(cmd_line)

    self.check_output(self.tabular_output_path, 'view')

    if self.names_dict:
        self.run.info('self.names_dict is found', 'Un-uniqueing the tabular output', quiet=True)
        self.progress.update('Un-uniqueing the tabular output ...')
        utils.ununique_BLAST_tabular_output(self.tabular_output_path, self.names_dict)

    self.progress.end()

    # truthiness instead of len(): names_dict may be None, and the qualifier
    # carries its own trailing space so the label reads correctly either way
    qualifier = 'un-uniqued ' if self.names_dict else ''
    self.run.info('Diamond %stabular output file' % qualifier, self.tabular_output_path)
def clusterize(self, parts):
    """Submit one qsub job per part, then wait until qstat reports no remaining jobs.

    A `<part>.sh` script is generated from `QSUB_SCRIPT` for each entry of
    `parts` and submitted with `qsub`. Afterwards
    `self.get_qstat_info(identifier)` is polled every 5 seconds until the sum
    of its values is 0.

    Returns True.
    """
    # 10 random uppercase letters that tag every job submitted by this call
    identifier = ''.join(random.choice(string.ascii_uppercase) for _ in range(10))

    for part in parts:
        command = self.command % {'binary': self.binary, 'part': part}

        shell_script = part + '.sh'
        script_body = QSUB_SCRIPT % {'log': part + '.log',
                                     'identifier': identifier,
                                     'command': command}

        # close the script explicitly before handing it to qsub
        with open(shell_script, 'w') as script_file:
            script_file.write(script_body)

        # submit script to cluster
        utils.run_command('qsub %s' % shell_script)

    while True:
        qstat_info = self.get_qstat_info(identifier)
        total_processes = sum(qstat_info.values())

        if total_processes == 0:
            break

        self.progress.update('Qstat Info :: Total Jobs: %s, %s' % (pp(total_processes),
                             ', '.join(['%s: %s' % (x, pp(qstat_info[x])) for x in qstat_info])))

        time.sleep(5)

    return True
def cluster(self, input_files, args, work_dir, threads=1):
    """Run the binning program and collect its bins from `work_dir`.

    Output files are expected to start with `<work_dir>/METABAT_`. Each output
    file becomes one cluster: the key is the file's base name with '.'
    replaced by '_', the value is the list of its stripped lines.

    Returns a dict of bin name -> list of str. Raises ConfigError when no
    output file is found.
    """
    J = lambda p: os.path.join(work_dir, p)

    bin_prefix = J('METABAT_')
    log_path = J('logs.txt')

    cmd_line = [self.program_name,
                '-i', input_files.contigs_fasta,
                '-a', input_files.contig_coverages,
                '-o', bin_prefix,
                '--cvExt',
                '-l',
                *utils.serialize_args(args)]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_path)
    self.progress.end()

    # bin_prefix already contains work_dir; joining it with work_dir again
    # would look in '<work_dir>/<work_dir>/...' whenever work_dir is relative
    output_file_paths = glob.glob(bin_prefix + '*')
    if not output_file_paths:
        raise ConfigError("Some critical output files are missing. Please take a look at the "
                          "log file: %s" % (log_path))

    clusters = {}
    for bin_file in output_file_paths:
        pretty_bin_name = os.path.basename(bin_file).replace('.', '_')
        with open(bin_file, 'r') as f:
            clusters[pretty_bin_name] = [line.strip() for line in f]

    return clusters
def process(self, fasta_file_path, output_dir):
    """Take the fasta file, run prodigal on it, and make sense of the output

    Returns a gene calls dict, and amino acid sequences dict. Both are keyed
    by a running integer id that starts at 0. Raises ConfigError if prodigal
    did not produce the amino acid sequences file.
    """
    gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
    amino_acid_sequences_dict = {}

    self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
    self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')

    log_file_path = os.path.join(output_dir, '00_log.txt')

    self.run.warning('', header='Finding ORFs in contigs', lc='green')
    self.run.info('Genes', self.genes_in_contigs)
    self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
    self.run.info('Log file', log_file_path)

    self.progress.new('Processing')
    self.progress.update('Identifying ORFs in contigs ...')

    cmd_line = ['prodigal',
                '-i', fasta_file_path,
                '-o', self.genes_in_contigs,
                '-a', self.amino_acid_sequences_in_contigs,
                '-p', 'meta']
    utils.run_command(cmd_line, log_file_path)

    if not os.path.exists(self.amino_acid_sequences_in_contigs):
        self.progress.end()
        # adjacent literals keep the message free of stray characters
        raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.update('Processing gene calls ...')

    fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

    hit_id = 0
    while next(fasta):
        gene_calls_dict[hit_id] = self.parser(fasta.id)
        # drop the '*' characters from the sequence before storing it
        amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
        hit_id += 1

    fasta.close()

    self.progress.end()

    self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(gene_calls_dict)), nl_after=1)

    return gene_calls_dict, amino_acid_sequences_dict
def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
    """Run the binning program and read the resulting `*.fna` bins from `work_dir`.

    Each bin file becomes one cluster. Its name is the file's base name with
    'sequence_' and '.fna' removed and '-' replaced by '_'; its members are
    the header lines (those starting with '>') stripped and with '>' removed.

    Returns a dict of bin name -> list of str. Raises ConfigError when no
    `*.fna` file is found in `work_dir`.
    """
    J = lambda p: os.path.join(work_dir, p)

    log_file_path = log_file_path or J('logs.txt')

    # long option names accepted in `args` -> the program's single-letter flags
    translation = {'preference': 'p',
                   'maxiter': 'm',
                   'conviter': 'v',
                   'damp': 'd',
                   'contigsize': 'x'}

    cmd_line = [self.program_name,
                '-c', input_files.contig_coverages_log_norm,
                '-f', os.path.dirname(input_files.contigs_fasta),
                '-l', os.path.basename(input_files.contigs_fasta),
                '-o', work_dir]
    cmd_line.extend(utils.serialize_args(args, single_dash=True, translate=translation))

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_file_path)
    self.progress.end()

    output_file_paths = glob.glob(J('*.fna'))
    if len(output_file_paths) == 0:
        raise ConfigError("Some critical output files are missing. Please take a look at the "
                          "log file: %s" % (log_file_path))

    clusters = {}
    for bin_file in output_file_paths:
        with open(bin_file, 'r') as bin_handle:
            bin_label = os.path.basename(bin_file)
            for old, new in (('sequence_', ''), ('.fna', ''), ('-', '_')):
                bin_label = bin_label.replace(old, new)

            members = []
            for line in bin_handle:
                if line.startswith('>'):
                    members.append(line.strip().replace('>', ''))
            clusters[bin_label] = members

    return clusters
def process(self, input_path, fasta_files):
    """Compute sourmash signatures for `fasta_files` and compare them.

    Both sourmash commands run with `input_path` as the working directory.
    The comparison is written to `output/kmer_<k>_mash_similarity.txt`
    (relative to `input_path`) and loaded into `self.results` under the key
    `kmer_<k>_mash_similarity`.

    Returns `self.results`. Raises ConfigError when either sourmash command
    exits with a non-zero code. The previous working directory is restored
    whether or not an error occurs.
    """
    self.run.info('[sourmash] Kmer size', self.kmer_size, nl_before=1)
    self.run.info('[sourmash] Compression ratio', self.scale)

    report_name = 'kmer_%d_mash_similarity' % self.kmer_size
    report_path = os.path.join('output', report_name + '.txt')

    def run_or_raise(command):
        # run one sourmash command; on failure report the command that failed
        exit_code = utils.run_command(command, self.log_file_path, remove_log_file_if_exists=False)
        if int(exit_code):
            self.progress.end()
            raise ConfigError("sourmash returned with non-zero exit code, there may be some errors. "
                              "Please check the log file `%s` for details. Offending command: "
                              "`%s` ..." % (self.log_file_path, ' '.join([str(x) for x in command[:7]])))

    # backup the old working directory before changing the directory
    old_wd = os.getcwd()
    os.chdir(input_path)

    try:
        if not os.path.exists('output'):
            os.mkdir('output')

        self.progress.new('Sourmash')
        self.progress.update('Computing fasta signatures for kmer=%d, scale=%d' % (self.kmer_size, self.scale))

        scale = '--scaled=%i' % self.scale
        compute_command = [self.program_name, 'compute',
                           '-k', self.kmer_size,
                           '-f', scale]
        compute_command.extend(fasta_files)
        run_or_raise(compute_command)

        self.progress.update('Computing similarity matrix for kmer=%d, scale=%d' % (self.kmer_size, self.scale))

        compare_command = [self.program_name, 'compare',
                           '-k', self.kmer_size,
                           '--csv', report_path]
        # the signature files passed to `compare` are named '<fasta>.sig'
        compare_command.extend(f + ".sig" for f in fasta_files)
        run_or_raise(compare_command)

        self.results[report_name] = utils.get_TAB_delimited_file_as_dictionary(report_path,
                                                                               indexing_field=-1,
                                                                               separator=',')

        self.progress.end()
    finally:
        # restore old working directory, even when something above failed
        os.chdir(old_wd)

    return self.results
def process(self, fasta_file_path, output_dir):
    """Take the fasta file, run prodigal on it, and make sense of the output

    Returns a gene calls dict, and amino acid sequences dict. Both are keyed
    by a running integer id that starts at 0, and both are empty when the
    amino acid sequences file produced by prodigal is empty. Raises
    ConfigError if that file was not produced at all.
    """
    gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
    amino_acid_sequences_dict = {}

    self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
    self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')

    log_file_path = os.path.join(output_dir, '00_log.txt')

    self.run.warning('', header='Finding ORFs in contigs', lc='green')
    self.run.info('Genes', self.genes_in_contigs)
    self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
    self.run.info('Log file', log_file_path)

    self.progress.new('Processing')
    self.progress.update('Identifying ORFs in contigs ...')

    cmd_line = ['prodigal',
                '-i', fasta_file_path,
                '-o', self.genes_in_contigs,
                '-a', self.amino_acid_sequences_in_contigs,
                '-p', 'meta']
    utils.run_command(cmd_line, log_file_path)

    if not os.path.exists(self.amino_acid_sequences_in_contigs):
        self.progress.end()
        # adjacent literals keep the message free of stray characters
        raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    # an existing but empty output is reported as "no genes", not as an error
    if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
        self.progress.end()
        self.run.info('Result', 'Prodigal (%s) has identified no genes :/' % (self.installed_version), nl_after=1, mc="red")
        return gene_calls_dict, amino_acid_sequences_dict

    self.progress.update('Processing gene calls ...')

    fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

    hit_id = 0
    while next(fasta):
        gene_calls_dict[hit_id] = self.parser(fasta.id)
        # drop the '*' characters from the sequence before storing it
        amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
        hit_id += 1

    fasta.close()

    self.progress.end()

    self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(gene_calls_dict)), nl_after=1)

    return gene_calls_dict, amino_acid_sequences_dict
def check_database(self):
    """Make sure a binary (.bin) version of the MODELLER database is available.

    Sets `self.database_path` to the `.bin` path derived from
    `self.modeller_database`. If that file exists nothing else happens. If
    only the `.pir` version exists, it is binarized with
    `self.run_binarize_database`. If neither exists, pdb_95.pir.gz is
    downloaded into `self.database_dir` and decompressed first.

    Raises ConfigError if `self.modeller_database` has an extension other
    than '.bin', '.pir' or none.
    """
    extensionless, extension = os.path.splitext(self.modeller_database)
    if extension not in [".bin", ".pir", ""]:
        raise ConfigError("MODELLER :: The only possible database extensions are .bin and .pir")

    bin_db_path = J(self.database_dir, extensionless + ".bin")
    pir_db_path = J(self.database_dir, extensionless + ".pir")
    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    self.database_path = bin_db_path

    if bin_exists:
        return

    # from here on the binary database is known to be missing

    if not pir_exists:
        self.progress.clear()
        self.run.warning("Anvi'o looked in {} for a database with the name {} and with an extension "
                         "of either .bin or .pir, but didn't find anything matching that "
                         "criteria. Anvi'o will try and download the best database it knows of from "
                         "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                         "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                         "database".format(self.database_dir, self.modeller_database))

        db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
        utils.download_file("https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
        utils.run_command(['gzip', '-d', db_download_path], log_file_path=filesnpaths.get_temp_file_path())

        # NOTE(review): the download is named pdb_95.pir, but this re-check looks
        # for a .pir named after self.modeller_database -- confirm they match
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    if pir_exists:
        self.progress.clear()
        self.run.warning("Your database is not in binary format. That means accessing its contents is slower "
                         "than it could be. Anvi'o is going to make a binary format. Just FYI")
        self.run_binarize_database(pir_db_path, bin_db_path)
def dry_run(self, workflow_graph_output_file_path_prefix='workflow'):
    """Not your regular dry run.

    The purpose of this function is to make sure there is a way to check for
    workflow program dependencies before the workflow is actually run. This
    way, if there is a `check_workflow_program_dependencies` call at the end
    of the snake file `get_workflow_snake_file_path(self.name)`, it can be
    called with a compiled snakemake `workflow` instance.

    Does nothing when `self.slave_mode` is set. When `self.save_workflow_graph`
    is set, the DAG printed by snakemake is saved to `<prefix>.dot`, and also
    rendered to `<prefix>.png` if the `dot` program is available. Raises
    ConfigError if no DAG is found in the snakemake log.
    """
    if self.slave_mode:
        return

    self.progress.new('Bleep bloop')
    self.progress.update('Quick dry run for an initial sanity check ...')
    args = ['snakemake', '--snakefile', get_workflow_snake_file_path(self.name),
            '--configfile', self.config_file, '--dryrun', '--quiet']

    if self.save_workflow_graph:
        args.extend(['--dag'])

    log_file_path = filesnpaths.get_temp_file_path()
    u.run_command(args, log_file_path)
    self.progress.end()

    # here we're getting the graph info from the log file like a dirty hacker
    # we are (it still may be better to do it elsewhere more appropriate .. so
    # we can look more decent or whatever):
    if self.save_workflow_graph:
        # plain text mode: the 'U' flag is rejected by Python 3.11 and later
        with open(log_file_path) as log_file:
            lines = log_file.readlines()

        # the graph starts at the first line that begins with 'digraph'
        line_of_interest = None
        for line_no, line in enumerate(lines):
            if line.startswith('digraph'):
                line_of_interest = line_no
                break

        if line_of_interest is None:
            # the log file is deliberately left in place so the user can read it
            raise ConfigError("Oh no. Anvi'o was trying to generate a DAG output for you, but something must have "
                              "gone wrong in a step prior. Something tells anvi'o that if you take a look at the "
                              "log file here, you may be able to figure it out: '%s'. Sorry!" % log_file_path)

        dot_file_path = workflow_graph_output_file_path_prefix + '.dot'
        png_file_path = workflow_graph_output_file_path_prefix + '.png'

        with open(dot_file_path, 'w') as dot_file:
            dot_file.write(''.join(lines[line_of_interest:]))

        self.run.info('Workflow DOT file', dot_file_path)

        if u.is_program_exists('dot', dont_raise=True):
            dot_log_file = filesnpaths.get_temp_file_path()
            u.run_command(['dot', '-Tpng', dot_file_path, '-o', png_file_path], dot_log_file)
            os.remove(dot_log_file)
            self.run.info('Workflow PNG file', png_file_path)
        else:
            self.run.warning("Well, anvi'o was going to try to save a nice PNG file for your workflow "
                             "graph, but clearly you don't have `dot` installed on your system. That's OK. You "
                             "have your dot file now, and you can Google 'how to view dot file on [your operating "
                             "system goes here]', and install necessary programs (like .. `dot`).")

    os.remove(log_file_path)
def dry_run(self, workflow_graph_output_file_path_prefix='workflow'):
    """Not your regular dry run.

    The purpose of this function is to make sure there is a way to check for
    workflow program dependencies before the workflow is actually run. This
    way, if there is a `check_workflow_program_dependencies` call at the end
    of the snake file `get_workflow_snake_file_path(self.name)`, it can be
    called with a compiled snakemake `workflow` instance.

    Does nothing when `self.this_workflow_is_inherited_by_another` is set.
    When `self.save_workflow_graph` is set, the DAG printed by snakemake is
    saved to `<prefix>.dot`, and also rendered to `<prefix>.pdf` if the `dot`
    program is available. Raises ConfigError if no DAG is found in the
    snakemake log.
    """
    if self.this_workflow_is_inherited_by_another:
        return

    self.progress.new('Bleep bloop')
    self.progress.update('Quick dry run for an initial sanity check ...')
    args = ['snakemake', '--snakefile', get_workflow_snake_file_path(self.name),
            '--configfile', self.config_file, '--dryrun', '--quiet']

    if self.save_workflow_graph:
        args.extend(['--dag'])

    log_file_path = filesnpaths.get_temp_file_path()
    u.run_command(args, log_file_path)
    self.progress.end()

    # here we're getting the graph info from the log file like a dirty hacker
    # we are (it still may be better to do it elsewhere more appropriate .. so
    # we can look more decent or whatever):
    if self.save_workflow_graph:
        # plain text mode: the 'U' flag is rejected by Python 3.11 and later
        with open(log_file_path) as log_file:
            lines = log_file.readlines()

        # the graph starts at the first line that begins with 'digraph'
        line_of_interest = None
        for line_no, line in enumerate(lines):
            if line.startswith('digraph'):
                line_of_interest = line_no
                break

        if line_of_interest is None:
            # the log file is deliberately left in place so the user can read it
            raise ConfigError("Oh no. Anvi'o was trying to generate a DAG output for you, but something must have "
                              "gone wrong in a step prior. Something tells anvi'o that if you take a look at the "
                              "log file here, you may be able to figure it out: '%s'. Sorry!" % log_file_path)

        dot_file_path = workflow_graph_output_file_path_prefix + '.dot'
        pdf_file_path = workflow_graph_output_file_path_prefix + '.pdf'

        with open(dot_file_path, 'w') as dot_file:
            dot_file.write(''.join(lines[line_of_interest:]))

        self.run.info('Workflow DOT file', dot_file_path)

        if u.is_program_exists('dot', dont_raise=True):
            dot_log_file = filesnpaths.get_temp_file_path()
            u.run_command(['dot', '-Tpdf', dot_file_path, '-o', pdf_file_path], dot_log_file)
            os.remove(dot_log_file)
            self.run.info('Workflow PDF file', pdf_file_path)
        else:
            self.run.warning("Well, anvi'o was going to try to save a nice PDF file for your workflow "
                             "graph, but clearly you don't have `dot` installed on your system. That's OK. You "
                             "have your dot file now, and you can Google 'how to view dot file on [your operating "
                             "system goes here]', and install necessary programs (like .. `dot`).")

    os.remove(log_file_path)
def process(self, fasta_file_path, output_dir):
    """Take the fasta file, run prodigal on it, and make sense of the output

    Returns a gene calls dict, and protein sequences dict. Both are keyed by
    a running integer id that starts at 0. Raises ConfigError if prodigal did
    not produce the proteins file.
    """
    gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
    protein_sequences_dict = {}

    self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
    self.proteins_in_contigs = os.path.join(output_dir, 'contigs.proteins')

    log_file_path = os.path.join(output_dir, '00_log.txt')

    self.run.warning('', header='Finding ORFs in contigs', lc='green')
    self.run.info('Genes', self.genes_in_contigs)
    self.run.info('Proteins', self.proteins_in_contigs)
    self.run.info('Log file', log_file_path)

    self.progress.new('Processing')
    self.progress.update('Identifying ORFs in contigs ...')
    cmd_line = ('prodigal -i "%s" -o "%s" -a "%s" -p meta >> "%s" 2>&1' % (fasta_file_path,
                                                                             self.genes_in_contigs,
                                                                             self.proteins_in_contigs,
                                                                             log_file_path))

    # record the command itself at the top of the log
    with open(log_file_path, "a") as log_file:
        log_file.write('CMD: ' + cmd_line + '\n')

    utils.run_command(cmd_line)

    if not os.path.exists(self.proteins_in_contigs):
        self.progress.end()
        # call-style raise is valid on both Python 2 and Python 3
        raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.update('Processing gene calls ...')

    fasta = fastalib.SequenceSource(self.proteins_in_contigs)

    hit_id = 0
    # builtin next() dispatches to `.next()` on Python 2 and `__next__()` on Python 3
    while next(fasta):
        gene_calls_dict[hit_id] = self.parser(fasta.id)
        # drop the '*' characters from the sequence before storing it
        protein_sequences_dict[hit_id] = fasta.seq.replace('*', '')
        hit_id += 1

    fasta.close()

    self.progress.end()

    self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_prodigal_version, len(gene_calls_dict)), nl_after=1)

    return gene_calls_dict, protein_sequences_dict
def view(self):
    """Convert the DIAMOND `.daa` search output into a tabular file with `diamond view`.

    The output format fields come from splitting `self.outfmt` on whitespace.
    When `self.names_dict` is truthy, the tabular output (which then only
    describes unique entries) is expanded in place with
    `utils.ununique_BLAST_tabular_output`, so downstream analyses do not need
    to pay attention to this detail.

    Raises ConfigError if `self.names_dict` is given while `self.outfmt` is
    neither a plain integer nor a string starting with '6 qseqid sseqid'.
    """
    self.run.warning(None, header="DIAMOND VIEW", lc="green")
    self.progress.new('DIAMOND')
    self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)

    cmd_line = ['diamond', 'view',
                '-a', self.search_output_path + '.daa',
                '-o', self.tabular_output_path,
                '-p', self.num_threads,
                '--outfmt', *self.outfmt.split()]

    self.run.info('Command line', ' '.join([str(x) for x in cmd_line]), quiet=True)

    utils.run_command(cmd_line, self.run.log_file_path)

    self.check_output(self.tabular_output_path, 'view')

    if self.names_dict:
        self.run.info('self.names_dict is found', 'Un-uniqueing the tabular output', quiet=True)
        self.progress.update('Un-uniqueing the tabular output ...')

        # catch only the failed int() conversion, not every possible exception
        try:
            int(self.outfmt)
            outfmt_is_plain_number = True
        except ValueError:
            outfmt_is_plain_number = False

        if not outfmt_is_plain_number and not self.outfmt.startswith("6 qseqid sseqid"):
            self.progress.end()
            raise ConfigError("drivers.diamond :: You can't supply a names_dict when running "
                              "view(...) unless your outfmt starts with '6 qseqid sseqid'. Update "
                              "utils.ununique_BLAST_tabular_output to fix this problem. If you're a "
                              "user, please report this on github.")

        utils.ununique_BLAST_tabular_output(self.tabular_output_path, self.names_dict)

    self.progress.end()

    # the qualifier carries its own trailing space, so the label has no
    # double space when there is no names_dict
    qualifier = 'un-uniqued ' if self.names_dict else ''
    self.run.info('Diamond %stabular output file' % qualifier, self.tabular_output_path)
def makedb(self):
    """Create a protein BLAST database by shelling out to `makeblastdb`.

    The input is `self.query_fasta` and the database prefix is `self.target_db_path`.
    The shell redirection in the command appends the program's stdout and stderr to
    `self.run.log_file_path`. Once the command has run, the `.phr` path next to the
    prefix is handed to `self.check_output`.
    """
    self.progress.new('BLAST')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    template = 'makeblastdb -in %s -dbtype prot -out %s >> "%s" 2>&1'
    makeblastdb_cmd = template % (self.query_fasta, self.target_db_path, self.run.log_file_path)

    self.run.info('blast makeblast cmd', makeblastdb_cmd, quiet=True)

    utils.run_command(makeblastdb_cmd)

    self.progress.end()

    self.check_output(self.target_db_path + '.phr', 'makeblastdb')
def makedb(self):
    """Create a protein BLAST database from `self.query_fasta` at `self.target_db_path`.

    The command string is run through `utils.run_command` with
    `self.run.log_file_path` as its log file. The `.phr` path next to the database
    prefix is then passed to `self.check_output`, and the command and the database
    prefix are reported through `self.run.info`.
    """
    self.progress.new('BLAST')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -out %s' % (
        self.query_fasta,
        self.target_db_path,
    )

    utils.run_command(makeblastdb_cmd, self.run.log_file_path)

    self.progress.end()

    phr_path = self.target_db_path + '.phr'
    self.check_output(phr_path, 'makeblastdb')

    self.run.info('blast makeblast cmd', makeblastdb_cmd, quiet=True)
    self.run.info('BLAST search db', self.target_db_path)
def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
    """Run the binning program and collect the contig names of every bin it produced.

    Parameters
    ==========
    input_files : object
        Must expose `contigs_fasta` and `contig_coverages`, which are passed to the
        program as `-contig` and `-abund`.
    args : object
        Extra program arguments; serialized with `utils.serialize_args`.
    work_dir : str
        Directory that receives the `MAXBIN_*` output files (and the default log file).
    threads : int, 1
        Passed to the program as `-thread`.
    log_file_path : str, None
        Log file for the run. Defaults to `logs.txt` inside `work_dir`.

    Returns
    =======
    clusters : dict
        Bin name -> list of FASTA deflines (without the leading '>') found in that bin's
        FASTA file. The bin name is the file name without '.fasta' and with every
        remaining '.' replaced by '_'.

    Raises
    ======
    ConfigError
        If no `MAXBIN_*.fasta` file exists after the run.
    """
    output_file_prefix = os.path.join(work_dir, 'MAXBIN_')

    if not log_file_path:
        log_file_path = os.path.join(work_dir, 'logs.txt')

    cmd_line = [self.program_name,
                '-contig', input_files.contigs_fasta,
                '-abund', input_files.contig_coverages,
                '-out', output_file_prefix,
                '-thread', str(threads),
                *utils.serialize_args(args, single_dash=True, use_underscore=True)]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_file_path)
    self.progress.end()

    # `output_file_prefix` already starts with `work_dir`; joining it with `work_dir`
    # a second time would point at a non-existent nested path whenever `work_dir` is
    # relative, so glob on the prefix directly.
    output_file_paths = glob.glob(output_file_prefix + '*.fasta')
    if not output_file_paths:
        raise ConfigError("Some critical output files are missing. Please take a look at the "
                          "log file: %s" % (log_file_path))

    clusters = {}
    for bin_file in output_file_paths:
        bin_name = os.path.basename(bin_file).replace('.fasta', '').replace('.', '_')

        with open(bin_file, 'r') as f:
            # only deflines matter here; sequence lines are skipped
            clusters[bin_name] = [line[1:].strip() for line in f if line.startswith('>')]

    return clusters
def hmmpress_sources(self, sources, tmp_dir):
    """Decompress every HMM model into `tmp_dir` and run hmmpress on it.

    Parameters
    ==========
    sources : dict
        Keyed by source name; each value must have a 'model' entry holding the path
        to the gzip-compressed HMM file.
    tmp_dir : str
        Directory that receives `<source>.hmm` for every source and `hmmpress.log`.

    Returns
    =======
    hmmpressed_file_paths : dict
        Source name -> path of the decompressed `.hmm` file hmmpress was run on.

    Raises
    ======
    ConfigError
        If hmmpress returns a non-zero value for any source.
    """
    hmmpressed_file_paths = {}

    # every source reports into the same log file
    log_file_path = os.path.join(tmp_dir, 'hmmpress.log')

    for source in sources:
        model_file = sources[source]['model']
        hmm_file_path = os.path.join(tmp_dir, source + '.hmm')

        # context managers close both handles even if reading or writing fails
        with gzip.open(model_file, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
            hmm_file.write(compressed_model.read())

        ret_val = utils.run_command(['hmmpress', hmm_file_path], log_file_path)

        if ret_val:
            raise ConfigError("Sadly, anvi'o failed while attempting to compress the HMM model for source %s. You can check out the log file (%s) for "
                              "more detailed information on why this happened." % (source, log_file_path))

        hmmpressed_file_paths[source] = hmm_file_path

    return hmmpressed_file_paths
def _command_runner(command, log_file_path):
    """Run `command`, sending its logs to `log_file_path`, and report failure by value.

    Returns the exit code when it is zero. In every other case a `CommandError`
    instance is *returned*, not raised: either built from the message of a
    `ConfigError` raised by `utils.run_command`, or describing the non-zero exit code.

    You are welcome to override this function, but keep returning `CommandError`
    instead of raising. This function runs in its own Thread, and an exception raised
    in a Thread (or AnviThread) does not bubble up into the calling context: the
    thread dies while the caller happily carries on, which is rarely what you want.
    Returning the error lets the calling context inspect and handle it; see the
    implementation of _run_commands for an example of how to do this properly.
    """
    try:
        exit_code = utils.run_command(command, log_file_path)
    except ConfigError as config_error:
        # utils.run_command can raise ConfigError; re-wrap its message so callers only
        # ever have to deal with CommandError.
        return CommandError(config_error.e)

    if exit_code < 0 or exit_code > 0:
        # Technically utils.run_command raises for negative codes already, but check
        # both signs here as a sanity measure.
        return CommandError(f"Failed to run '{command}'. Exit code: {exit_code}")

    return exit_code
def view(self):
    """Turn the DIAMOND `.daa` search output into tabular output with `diamond view`.

    The command line is first appended to `self.log_file_path`; the shell redirection
    in the command sends the program's own output to the same file. The tabular
    output path is then passed to `self.check_output`.
    """
    self.progress.new('DIAMOND')
    self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)

    daa_path = self.search_output_path + '.daa'
    view_cmd = 'diamond view -a %s -o %s -p %d >> "%s" 2>&1' % (
        daa_path,
        self.tabular_output_path,
        self.num_threads,
        self.log_file_path,
    )

    with open(self.log_file_path, "a") as log_handle:
        log_handle.write('CMD: %s\n' % view_cmd)

    utils.run_command(view_cmd)

    self.progress.end()

    self.check_output(self.tabular_output_path, 'view')

    self.run.info('Diamond tabular output file', self.tabular_output_path)
def makedb(self, output_db_path=None):
    """Create a protein BLAST database from `self.target_fasta`.

    Parameters
    ==========
    output_db_path : str, None
        Database prefix given to `makeblastdb -out`. When it is not set,
        `self.target_fasta` is used as the prefix.
    """
    self.progress.new('BLAST')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    makeblastdb_args = ['makeblastdb']
    makeblastdb_args += ['-in', self.target_fasta]
    makeblastdb_args += ['-dbtype', 'prot']
    makeblastdb_args += ['-out', output_db_path or self.target_fasta]

    utils.run_command(makeblastdb_args, self.run.log_file_path)

    self.progress.end()

    # the `.phr` file next to the prefix is what gets checked
    phr_path = (output_db_path or self.target_fasta) + '.phr'
    self.check_output(phr_path, 'makeblastdb')

    self.run.info('blast makeblast cmd', makeblastdb_args, quiet=True)
    self.run.info('BLAST search db', self.target_fasta)
def cluster(self):
    """Run `mcl` on `self.mcl_input_file_path` and write clusters to `self.clusters_file_path`.

    The command line is appended to `self.log_file_path` before the run; the shell
    redirection in the command sends mcl's own output to the same file.
    """
    self.progress.new('MCL')
    self.progress.update('clustering (using %d thread(s)) ...' % self.num_threads)

    mcl_template = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1'
    mcl_cmd = mcl_template % (self.mcl_input_file_path,
                              self.inflation,
                              self.clusters_file_path,
                              self.num_threads,
                              self.log_file_path)

    with open(self.log_file_path, "a") as log_handle:
        log_handle.write('MCL CMD: %s\n' % mcl_cmd)

    utils.run_command(mcl_cmd)

    self.progress.end()

    # NOTE(review): the 'makedb' label looks carried over from a database-building
    # method; confirm how check_output uses it before changing it.
    self.check_output(self.clusters_file_path, 'makedb')

    self.run.info('MCL output', self.clusters_file_path)
def makedb(self):
    """Create a DIAMOND search database from `self.query_fasta` by shelling out to `diamond makedb`.

    The database prefix is `self.target_db_path`. The shell redirection in the
    command appends the program's output to `self.run.log_file_path`. The `.dmnd`
    path next to the prefix is passed to `self.check_output` and then reported.
    """
    self.progress.new('DIAMOND')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    template = 'diamond makedb --in %s -d %s -p %d >> "%s" 2>&1'
    makedb_cmd = template % (self.query_fasta,
                             self.target_db_path,
                             self.num_threads,
                             self.run.log_file_path)

    self.run.info('diamond makedb cmd', makedb_cmd, quiet=True)

    utils.run_command(makedb_cmd)

    self.progress.end()

    dmnd_path = self.target_db_path + '.dmnd'
    self.check_output(dmnd_path, 'makedb')

    self.run.info('Diamond temp search db', dmnd_path)
def makedb(self, output_file_path=None):
    """Create a DIAMOND search database from `self.query_fasta`.

    Parameters
    ==========
    output_file_path : str, None
        Database prefix given to `diamond makedb -d`. When it is not set,
        `self.target_fasta` is used as the prefix.
    """
    self.progress.new('DIAMOND')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    makedb_args = ['diamond', 'makedb']
    makedb_args += ['--in', self.query_fasta]
    makedb_args += ['-d', output_file_path or self.target_fasta]
    makedb_args += ['-p', self.num_threads]

    utils.run_command(makedb_args, self.run.log_file_path)

    self.progress.end()

    dmnd_path = (output_file_path or self.target_fasta) + '.dmnd'
    self.check_output(dmnd_path, 'makedb')

    # the thread count is not a string, so every item is stringified for display
    self.run.info('diamond makedb cmd', ' '.join(map(str, makedb_args)), quiet=True)
    self.run.info('Diamond search db', dmnd_path)
def cluster(self):
    """Cluster `self.mcl_input_file_path` with `mcl`, producing `self.clusters_file_path`.

    Inflation comes from `self.inflation` and the thread count from
    `self.num_threads`. The command is recorded in `self.log_file_path`, which also
    receives mcl's stdout and stderr through the shell redirection.
    """
    self.progress.new('MCL')
    self.progress.update('clustering (using %d thread(s)) ...' % self.num_threads)

    format_args = (
        self.mcl_input_file_path,
        self.inflation,
        self.clusters_file_path,
        self.num_threads,
        self.log_file_path,
    )
    mcl_cmd = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1' % format_args

    with open(self.log_file_path, "a") as log_handle:
        log_handle.write('MCL CMD: ' + mcl_cmd + '\n')

    utils.run_command(mcl_cmd)

    self.progress.end()

    # NOTE(review): 'makedb' is an odd label for an mcl run; verify against
    # check_output before touching it.
    self.check_output(self.clusters_file_path, 'makedb')

    self.run.info('MCL output', self.clusters_file_path)
def view(self):
    """Generate tabular output from `<self.search_output_path>.daa` using `diamond view`.

    The command is written to `self.log_file_path` and then executed; the shell
    redirection in the command appends the program's output to the same log.
    """
    self.progress.new('DIAMOND')
    self.progress.update('generating tabular output (using %d thread(s)) ...' % self.num_threads)

    format_args = (
        self.search_output_path + '.daa',
        self.tabular_output_path,
        self.num_threads,
        self.log_file_path,
    )
    view_cmd = 'diamond view -a %s -o %s -p %d >> "%s" 2>&1' % format_args

    with open(self.log_file_path, "a") as log_handle:
        log_handle.write('CMD: ' + view_cmd + '\n')

    utils.run_command(view_cmd)

    self.progress.end()

    self.check_output(self.tabular_output_path, 'view')

    self.run.info('Diamond tabular output file', self.tabular_output_path)
def blastp(self):
    """Search `self.query_fasta` against `self.target_db_path` with `diamond blastp`.

    Results are written with the prefix `self.search_output_path`; the `.daa` path
    derived from it is passed to `self.check_output`. The command is recorded in
    `self.log_file_path`, which also receives the program's output through the shell
    redirection.
    """
    self.progress.new('DIAMOND')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    format_args = (
        self.query_fasta,
        self.target_db_path,
        self.search_output_path,
        self.tmp_dir,
        self.num_threads,
        self.log_file_path,
    )
    blastp_cmd = 'diamond blastp -q %s -d %s -a %s -t %s -p %d >> "%s" 2>&1' % format_args

    with open(self.log_file_path, "a") as log_handle:
        log_handle.write('CMD: %s\n' % blastp_cmd)

    utils.run_command(blastp_cmd)

    self.progress.end()

    daa_path = self.search_output_path + '.daa'
    self.check_output(daa_path, 'blastp')

    self.run.info('Diamond blastp results', daa_path)
def makedb(self, output_file_path=None):
    """Create a DIAMOND search database from `self.query_fasta`.

    Parameters
    ==========
    output_file_path : str, None
        Database prefix given to `diamond makedb -d`. When it is not set,
        `self.target_fasta` is used as the prefix.

    Unlike some other `makedb` variants, this one only reports the expected `.dmnd`
    path; it does not pass it to `self.check_output`.
    """
    self.progress.new('DIAMOND')
    self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads)

    # NOTE Question from Evan. Why is the query_fasta the input to the database?
    db_prefix = output_file_path or self.target_fasta
    makedb_args = ['diamond', 'makedb',
                   '--in', self.query_fasta,
                   '-d', db_prefix,
                   '-p', self.num_threads]

    utils.run_command(makedb_args, self.run.log_file_path)

    self.progress.end()

    dmnd_path = (output_file_path or self.target_fasta) + '.dmnd'

    self.run.info('diamond makedb cmd', ' '.join(str(item) for item in makedb_args), quiet=True)
    self.run.info('Diamond search db', dmnd_path)
def blastp(self):
    """Search `self.query_fasta` against `self.target_db_path` with NCBI `blastp`.

    Output is requested in tabular format (`-outfmt 6`) at `self.search_output_path`.
    The shell redirection in the command appends the program's stdout and stderr to
    `self.run.log_file_path`.
    """
    self.progress.new('BLASTP')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    template = 'blastp -query %s -db %s -evalue %f -outfmt 6 -out %s -num_threads %d >> "%s" 2>&1'
    blastp_cmd = template % (self.query_fasta,
                             self.target_db_path,
                             self.evalue,
                             self.search_output_path,
                             self.num_threads,
                             self.run.log_file_path)

    self.run.info('blast blastp cmd', blastp_cmd, quiet=True)

    utils.run_command(blastp_cmd)

    self.progress.end()

    self.check_output(self.search_output_path, 'blastp')

    self.run.info('BLASTP results', self.search_output_path)
def blastp(self):
    """Run NCBI `blastp` for `self.query_fasta` against the database at `self.target_db_path`.

    The e-value cutoff is `self.evalue`, and the tabular (`-outfmt 6`) results go to
    `self.search_output_path`. The program's output is appended to
    `self.run.log_file_path` by the shell redirection in the command.
    """
    self.progress.new('BLASTP')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    format_args = (
        self.query_fasta,
        self.target_db_path,
        self.evalue,
        self.search_output_path,
        self.num_threads,
        self.run.log_file_path,
    )
    blastp_cmd = 'blastp -query %s -db %s -evalue %f -outfmt 6 -out %s -num_threads %d >> "%s" 2>&1' % format_args

    self.run.info('blast blastp cmd', blastp_cmd, quiet=True)

    utils.run_command(blastp_cmd)

    self.progress.end()

    self.check_output(self.search_output_path, 'blastp')

    self.run.info('BLASTP results', self.search_output_path)
def blastp(self):
    """Run `diamond blastp` for `self.query_fasta` against `self.target_db_path`.

    `self.tmp_dir` is passed as `-t` and `self.search_output_path` as `-a`. The
    command is appended to `self.log_file_path` before it is executed, and the
    `.daa` path derived from the output prefix is passed to `self.check_output`.
    """
    self.progress.new('DIAMOND')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    template = 'diamond blastp -q %s -d %s -a %s -t %s -p %d >> "%s" 2>&1'
    blastp_cmd = template % (self.query_fasta,
                             self.target_db_path,
                             self.search_output_path,
                             self.tmp_dir,
                             self.num_threads,
                             self.log_file_path)

    with open(self.log_file_path, "a") as log_handle:
        log_handle.write('CMD: ' + blastp_cmd + '\n')

    utils.run_command(blastp_cmd)

    self.progress.end()

    daa_path = self.search_output_path + '.daa'
    self.check_output(daa_path, 'blastp')

    self.run.info('Diamond blastp results', daa_path)
def run_command(self, query_targets, reference_targets, output_path, run_dir=None, name_conversion_dict=None):
    """Run fastANI in many-to-many mode and return the results

    Parameters
    ==========
    query_targets : string or Path-like
        The query set of genomes (--ql). It should be a list of filepaths, one per line

    reference_targets : string or Path-like
        The reference set of genomes (--rl). It should be a list of filepaths, one per line

    output_path : string or Path-like
        Where should the raw fastANI output file be created? Relative to current working
        directory, not `run_dir`

    run_dir : string or Path-like, None
        Where should the command be run? When None, the working directory at the time of
        the call is used.

    name_conversion_dict : dict, None
        The keys of `results` are by default the file paths of the genomes, since that's what
        fastANI outputs. Pass an optional dictionary with <path>:<name> to convert the output.
        Note: this affects both the raw output in `output_path` and `results`

    Returns
    =======
    results : dictionary
        results dictionary

    Raises
    ======
    ConfigError
        If fastANI exits with a non-zero exit code.
    """

    if run_dir is None:
        # resolved here rather than in the signature: a default of `os.getcwd()` is
        # evaluated once when the function is defined, not on every call
        run_dir = os.getcwd()

    self.add_run_info()

    self.query_names, self.reference_names = self.get_all_query_and_reference_names(query_targets, reference_targets)

    command = [self.program_name,
               '--ql', query_targets,
               '--rl', reference_targets,
               '-k', self.kmer_size,
               '--fragLen', self.fragment_length,
               '--minFrag', self.min_num_fragments,
               '-t', self.num_threads,
               '-o', output_path]

    self.progress.new('fastANI')
    self.progress.update('Many to Many ...')

    with utils.RunInDirectory(run_dir):
        exit_code = utils.run_command(command, self.log_file_path)

    self.progress.end()

    if int(exit_code):
        raise ConfigError("fastANI returned with non-zero exit code, there may be some errors. "
                          "please check the log file for details.")

    self.fastANI_output = self.load_output_as_dataframe(output_path, name_conversion_dict)

    # the loaded (and possibly name-converted) table replaces the raw file at `output_path`
    utils.store_dataframe_as_TAB_delimited_file(self.fastANI_output, output_path)

    self.results = self.gen_results_dict()

    return self.results
def cluster(self):
    """Run `mcl` on `self.mcl_input_file_path`, writing clusters to `self.clusters_file_path`.

    The inflation value is reported first. The shell redirection in the command
    appends mcl's stdout and stderr to `self.run.log_file_path`.
    """
    self.run.info('MCL inflation', self.inflation)

    self.progress.new('MCL')
    self.progress.update('clustering (using %d thread(s)) ...' % self.num_threads)

    mcl_template = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1'
    mcl_cmd = mcl_template % (self.mcl_input_file_path,
                              self.inflation,
                              self.clusters_file_path,
                              self.num_threads,
                              self.run.log_file_path)

    self.run.info('mcl cmd', mcl_cmd, quiet=True)

    utils.run_command(mcl_cmd)

    self.progress.end()

    # NOTE(review): the 'makedb' label looks carried over from a database-building
    # method; confirm how check_output uses it before changing it.
    self.check_output(self.clusters_file_path, 'makedb')

    self.run.info('MCL output', self.clusters_file_path)
def cluster(self):
    """Cluster `self.mcl_input_file_path` with `mcl` into `self.clusters_file_path`.

    Reports the inflation value and the command through `self.run.info`. The
    program's output ends up in `self.run.log_file_path` via the shell redirection
    in the command.
    """
    self.run.info('MCL inflation', self.inflation)

    self.progress.new('MCL')
    self.progress.update('clustering (using %d thread(s)) ...' % self.num_threads)

    format_args = (
        self.mcl_input_file_path,
        self.inflation,
        self.clusters_file_path,
        self.num_threads,
        self.run.log_file_path,
    )
    mcl_cmd = 'mcl %s --abc -I %f -o %s -te %d >> "%s" 2>&1' % format_args

    self.run.info('mcl cmd', mcl_cmd, quiet=True)

    utils.run_command(mcl_cmd)

    self.progress.end()

    # NOTE(review): 'makedb' is an odd label for an mcl run; verify against
    # check_output before touching it.
    self.check_output(self.clusters_file_path, 'makedb')

    self.run.info('MCL output', self.clusters_file_path)
def check_database(self):
    """Make sure a binary MODELLER database exists, and set `self.database_path` to it.

    Looks in `self.database_dir` for `<name>.bin` and `<name>.pir`, where `<name>` is
    `self.modeller_database` without its extension:

      - if the `.bin` file exists, nothing else is done;
      - if neither exists, `pdb_95.pir.gz` is downloaded into `self.database_dir`
        and decompressed with `gzip -d`;
      - if the `.pir` file exists (possibly as a result of the download) it is
        binarized with `self.run_binarize_database`.

    `self.database_path` is set to the `.bin` path in every case.

    Raises
    ======
    ConfigError
        If `self.modeller_database` carries an extension other than `.bin` or `.pir`.
    """

    extensionless, extension = os.path.splitext(self.modeller_database)
    if extension not in [".bin", ".pir", ""]:
        raise ConfigError("MODELLER :: The only possible database extensions are .bin and .pir")

    bin_db_path = J(self.database_dir, extensionless + ".bin")
    pir_db_path = J(self.database_dir, extensionless + ".pir")
    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    self.database_path = bin_db_path

    if bin_exists:
        return

    if not pir_exists and not bin_exists:
        self.progress.clear()
        # adjacent literals instead of backslash continuations keep source indentation
        # out of the message
        self.run.warning(("Anvi'o looked in {} for a database with the name {} and with an extension "
                          "of either .bin or .pir, but didn't find anything matching that "
                          "criteria. We'll try and download the best database we know of from "
                          "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                          "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                          "database").format(self.database_dir, self.modeller_database))

        db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
        utils.download_file("https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
        utils.run_command(['gzip', '-d', db_download_path], log_file_path=filesnpaths.get_temp_file_path())

        # NOTE(review): the download always lands as pdb_95.pir, while `pir_db_path`
        # derives from self.modeller_database; confirm these agree for other names.
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    if pir_exists and not bin_exists:
        self.progress.clear()
        self.run.warning("Your database is not in binary format. That means accessing its contents is slower "
                         "than it could be. Anvi'o is going to make a binary format. Just FYI")
        self.run_binarize_database(pir_db_path, bin_db_path)
def blastp(self):
    """Run `diamond blastp` for `self.query_fasta` against `self.target_db_path`.

    `--sensitive` is added to the command when `self.sensitive` is truthy. The shell
    redirection in the command appends the program's output to
    `self.run.log_file_path`, and the `.daa` path derived from
    `self.search_output_path` is passed to `self.check_output`.
    """
    self.run.info('DIAMOND is set to be', 'Sensitive' if self.sensitive else 'Fast')

    self.progress.new('DIAMOND')
    self.progress.update('running blastp (using %d thread(s)) ...' % self.num_threads)

    template = 'diamond blastp -q %s -d %s -a %s -t %s -p %d %s -k 1000000 >> "%s" 2>&1'
    blastp_cmd = template % (self.query_fasta,
                             self.target_db_path,
                             self.search_output_path,
                             self.tmp_dir,
                             self.num_threads,
                             '--sensitive' if self.sensitive else '',
                             self.run.log_file_path)

    self.run.info('diamond blastp cmd', blastp_cmd, quiet=True)

    utils.run_command(blastp_cmd)

    self.progress.end()

    daa_path = self.search_output_path + '.daa'
    self.check_output(daa_path, 'blastp')

    self.run.info('Diamond blastp results', daa_path)
def run_command(self, input_path):
    """Run PyANI inside `input_path` and load the matrices it wrote.

    Parameters
    ==========
    input_path : str
        Directory given to the program as `--indir`. The program is run from within
        this directory with `--outdir output`, and the matrices are read from
        `<input_path>/output`.

    Returns
    =======
    matrices : dict
        Matrix name -> contents of `<self.method>_<name>.tab` as returned by
        `utils.get_TAB_delimited_file_as_dictionary`, for every expected matrix file
        that exists.

    Raises
    ======
    ConfigError
        If the program exits with a non-zero exit code, or none of the expected
        matrix files is found.
    """

    # backup the old working directory before changing the directory
    old_wd = os.getcwd()
    os.chdir(input_path)

    try:
        full_command = [self.program_name,
                        '--outdir', 'output',
                        '--indir', input_path,
                        '--method', self.method,
                        '--workers', self.num_threads]

        self.progress.new('PyANI')
        self.progress.update('Running ...')
        exit_code = utils.run_command(full_command, self.log_file_path)
        self.progress.end()

        if int(exit_code):
            raise ConfigError("PyANI returned with non-zero exit code, there may be some errors. "
                              "please check the log file for details.")

        output_matrix_names = ['alignment_coverage', 'alignment_lengths', 'hadamard',
                               'percentage_identity', 'similarity_errors', 'correlations']

        matrices = {}
        for matrix_name in output_matrix_names:
            output_matrix_path = os.path.join(input_path, 'output', self.method + '_' + matrix_name + '.tab')

            # not every expected matrix has to be present; missing ones are skipped
            if os.path.exists(output_matrix_path):
                matrices[matrix_name] = utils.get_TAB_delimited_file_as_dictionary(output_matrix_path, empty_header_columns_are_OK=True)

        if not matrices:
            raise ConfigError("None of the output matrices pyANI was supposed to generate was found in the "
                              "output directory :( You may find some clues in the log file?")

        self.run.info_single("Output matrices for the following items are stored in the output "
                             "directory: %s <success kid meme.png>."
                             % (', '.join(["'%s'" % m.replace('_', ' ') for m in matrices])),
                             nl_before=1, mc='green')

        return matrices
    finally:
        # restore old working directory, including when one of the errors above is raised
        os.chdir(old_wd)
def run_hmmscan(self, source, alphabet, context, kind, domain, num_genes_in_model, hmm, ref, noise_cutoff_terms):
    """Run an HMM search for a single source and return the path to its TAB-delimited hits.

    Parameters
    ==========
    source : str
        Name of the HMM source; used in the header printed to the terminal.
    alphabet : str
        Joined with `context` as '<alphabet>:<context>' to find the sequences file in
        `self.target_files_dict`. 'DNA' and 'RNA' select `nhmmscan`; anything else
        selects `hmmscan`.
    context : str
        See `alphabet`.
    kind, domain, ref, num_genes_in_model
        Only reported to the terminal.
    hmm : str
        Path to the gzip-compressed HMM model.
    noise_cutoff_terms : str
        Whitespace-separated flags passed on to the search program.

    Returns
    =======
    str or None
        `self.hmm_scan_hits` when it contains at least one line, otherwise None.

    Raises
    ======
    ConfigError
        If the target is unknown or has no sequences file, if hmmpress returns a
        non-zero value, or if the search does not produce its `--tblout` file.
    """
    target = ':'.join([alphabet, context])

    if target not in self.target_files_dict:
        raise ConfigError("You have an unknown target :/ Target, which defines an alphabet and context "
                          "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                          "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                          "doesn't work for anvi'o." % target)

    if not self.target_files_dict[target]:
        raise ConfigError("HMMer class does not know about Sequences file for the target %s :/" % target)

    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Kind', kind if kind else 'unknown')
    self.run.info('Alphabet', alphabet)
    self.run.info('Context', context)
    self.run.info('Domain', domain if domain else 'N\\A')
    self.run.info('HMM model path', hmm)
    self.run.info('Number of genes', num_genes_in_model)
    self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # context managers make sure both handles are closed even if the copy fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')

    cmd_line = ['hmmpress', hmm_file_path]
    ret_val = utils.run_command(cmd_line, log_file_path)

    if ret_val:
        # close the progress bar before raising, as the hmmscan failure path below does
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER you have "
                          "installed is either not up-to-date enough, or too new :/ Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe if you think updating HMMER can resolve it. You can "
                          "learn which version of HMMER you have on your system by typing 'hmmpress -h'."
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')

    cmd_line = ['nhmmscan' if alphabet in ['DNA', 'RNA'] else 'hmmscan',
                '-o', self.hmm_scan_output, *noise_cutoff_terms.split(),
                '--cpu', self.num_threads_to_use,
                '--tblout', self.hmm_scan_hits_shitty,
                hmm_file_path, self.target_files_dict[target]]

    utils.run_command(cmd_line, log_file_path)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    lines_with_non_ascii = []

    with open(self.hmm_scan_hits, 'w') as parseable_output, open(self.hmm_scan_hits_shitty, 'rb') as hmm_hits_file:
        for line_counter, line_bytes in enumerate(hmm_hits_file, start=1):
            # non-ascii bytes are dropped by the decode; a length mismatch reveals that some were there
            line = line_bytes.decode('ascii', 'ignore')

            if len(line) != len(line_bytes):
                lines_with_non_ascii.append(line_counter)

            if line.startswith('#'):
                continue

            # keep the first 18 whitespace-separated fields, TAB-delimited
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    if lines_with_non_ascii:
        self.run.warning("Just a heads-up, Anvi'o HMMer parser detected non-ascii charachters while processing "
                         "the file '%s' and cleared them. Here are the line numbers with non-ascii charachters: %s. "
                         "You may want to check those lines with a command like \"awk 'NR==<line number>' <file path> | cat -vte\"."
                         % (self.hmm_scan_hits_shitty, ", ".join(map(str, lines_with_non_ascii))))

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def process(self, output_dir, drop_previous_annotations=False):
    """Takes an anvi'o contigs database, and does its magic.

    This involves exporting amino acid sequences for gene calls, running emapper.py
    on them, parsing the output, and storing the results in the contigs database.

    Parameters
    ==========
    output_dir : str
        Directory in which the temporary files are written; the mapper is run from
        within it.
    drop_previous_annotations : bool, False
        Passed on to `self.store_annotations_in_db`.

    Raises
    ======
    ConfigError
        If there is no contigs database path, if genes were not called for the
        contigs database, or if the annotations file is missing after the run.
    """

    if not self.contigs_db_path:
        raise ConfigError("EggNOGMapper::process() is speaking: you can't really call this function if you inherited "
                          "this class without a contigs database path :/ What are you doing?")

    filesnpaths.is_output_dir_writable(output_dir)

    contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
    if not contigs_db.meta['genes_are_called']:
        raise ConfigError("It seems genes were not called for this contigs database (%s). This is a "
                          "total no-no since we will need them to get amino acid seqeunces for functional "
                          "annotationd :/" % self.contigs_db_path)

    aa_sequences_list = contigs_db.db.get_table_as_list_of_tuples(t.gene_amino_acid_sequences_table_name)
    num_aa_sequences = len(aa_sequences_list)
    contigs_db.disconnect()

    # change the current work directory
    work_dir = os.getcwd()
    os.chdir(output_dir)

    try:
        self.run.info('Work directory for temporary files', output_dir)
        self.run.info('Num threads to use', self.num_threads)
        self.run.info('Target database', self.database, mc='red')
        self.run.info('Use memomory', self.usemem)
        self.run.info('Genes found', num_aa_sequences, mc='green')
        self.run.info('AA sequences', self.aa_sequences_file_name)

        self.progress.new('Processing')
        self.progress.update('Storing gene sequences ...')

        with open(self.aa_sequences_file_name, 'w') as aa_sequences_fp:
            for gene_callers_id, aa_sequence in aa_sequences_list:
                aa_sequences_fp.write('>%s%d\n%s\n' % (self.gene_caller_id_prefix, gene_callers_id, aa_sequence))

        # the sequences are on disk now; release the in-memory copy before the long run
        del aa_sequences_list

        cmd_line = [self.executable, '-i', self.aa_sequences_file_name, '--output', self.output_file_prefix]

        # num threads
        if self.num_threads:
            cmd_line.extend(['--cpu', self.num_threads])

        # usemem
        if self.usemem:
            cmd_line.extend(['--usemem'])

        # database
        cmd_line.extend(['--database', self.database])

        self.progress.update('Running eggnog-mapper on %d sequences. This may take a while ...' % num_aa_sequences)
        utils.run_command(cmd_line, self.log_file_path)

        if not os.path.exists(self.annotations_file_name):
            self.progress.end()
            raise ConfigError("Something went wrong with eggnog-mapper :( The annotations file is not where it is supposed to be. "
                              "If you are lucky, this log file will have enough output information for you to make sense of "
                              "what went wrong: '%s'. Due to this error, the output directory will be kept as is, and you "
                              "will have to remove it manually. Sorry about the inconvenience! Anvi'o developers know how much "
                              "it sucks when things just don't work." % os.path.join(output_dir, self.log_file_path))

        self.progress.end()
    finally:
        # go back to where we started, even if something above failed
        os.chdir(work_dir)

    # we are done, and the annotations file is there. the path is built from `output_dir`,
    # so for a relative `output_dir` it only resolves from the original directory.
    self.populate_annotations_dict(os.path.join(output_dir, self.annotations_file_name))

    # alright. store annotations into the database
    self.store_annotations_in_db(drop_previous_annotations=drop_previous_annotations)
def process(self, fasta_file_path, output_dir):
    """Take the fasta file, run prodigal on it, and make sense of the output.

    Parameters
    ==========
    fasta_file_path : str
        FASTA file handed to prodigal through `-i`.
    output_dir : str
        Directory in which `contigs.genes`, `contigs.amino_acid_sequences` and
        `00_log.txt` are written.

    Returns
    =======
    (gene_calls_dict, amino_acid_sequences_dict) : tuple of dicts
        Both are keyed by a running integer id, one per sequence in the amino acid
        FASTA prodigal produced. Both are empty when that file is empty.

    Raises
    ======
    ConfigError
        If prodigal did not create the amino acid sequences file.
    """

    gene_calls_dict = {}  # each entry must contain {'contig', 'start', stop, 'direction', 'partial'} items.
    amino_acid_sequences_dict = {}

    self.genes_in_contigs = os.path.join(output_dir, 'contigs.genes')
    self.amino_acid_sequences_in_contigs = os.path.join(output_dir, 'contigs.amino_acid_sequences')

    log_file_path = os.path.join(output_dir, '00_log.txt')

    self.run.warning('', header='Finding ORFs in contigs', lc='green')
    self.run.info('Genes', self.genes_in_contigs)
    self.run.info('Amino acid sequences', self.amino_acid_sequences_in_contigs)
    self.run.info('Log file', log_file_path)

    cmd_line = ['prodigal', '-i', fasta_file_path, '-o', self.genes_in_contigs, '-a', self.amino_acid_sequences_in_contigs]

    if self.prodigal_translation_table:
        # a user-defined translation table and the metagenomics mode are mutually exclusive here
        cmd_line.extend(['-g', self.prodigal_translation_table])
        self.run.warning("Prodigal translation table is set to '%s' (whatever you did has worked so far, but "
                         "keep an eye for errors from prodigal in case it doesn't like your translation table "
                         "parameter). This means we will not use prodigal in the metagenomics mode, due to this "
                         "issue: https://github.com/hyattpd/Prodigal/issues/19. If that issue is closed, and you "
                         "are reading this message, then please contact an anvi'o developer."
                         % str(self.prodigal_translation_table))
    else:
        cmd_line.extend(['-p', 'meta'])

    self.progress.new('Processing')
    self.progress.update('Identifying ORFs in contigs ...')

    utils.run_command(cmd_line, log_file_path)

    if not os.path.exists(self.amino_acid_sequences_in_contigs):
        self.progress.end()
        raise ConfigError("Something went wrong with prodigal, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    if filesnpaths.is_file_empty(self.amino_acid_sequences_in_contigs):
        # prodigal ran, but found nothing: report it and return the empty dicts
        self.progress.end()
        self.run.info('Result', 'Prodigal (%s) has identified no genes :/' % (self.installed_version), nl_after=1, mc="red")
        return gene_calls_dict, amino_acid_sequences_dict

    self.progress.update('Processing gene calls ...')

    fasta = fastalib.SequenceSource(self.amino_acid_sequences_in_contigs)

    hit_id = 0
    while next(fasta):
        gene_calls_dict[hit_id] = self.parser(fasta.id)
        # '*' characters are stripped from the stored sequence
        amino_acid_sequences_dict[hit_id] = fasta.seq.replace('*', '')
        hit_id += 1

    fasta.close()

    self.progress.end()

    self.run.info('Result', 'Prodigal (%s) has identified %d genes.' % (self.installed_version, len(gene_calls_dict)), nl_after=1)

    return gene_calls_dict, amino_acid_sequences_dict
def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag="--cut_ga"):
    """Run hmmscan for a single HMM source and return the path to its TAB-delimited hits.

    Parameters
    ==========
    source : str
        Name of the HMM source; used in the header printed to the terminal.
    genes_in_model : sized collection
        Only its length is reported.
    hmm : str
        Path to the gzip-compressed HMM model.
    ref : str or None
        Only reported to the terminal.
    cut_off_flag : str, "--cut_ga"
        Inserted into the hmmscan command line as-is.

    Returns
    =======
    str or None
        `self.hmm_scan_hits` when it contains at least one line, otherwise None.

    Raises
    ======
    ConfigError
        If hmmscan does not produce its `--tblout` file.
    """
    self.run.warning('', header='HMM Profiling for %s' % source, lc='green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Pfam model', hmm)
    self.run.info('Number of genes', len(genes_in_model))

    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # gzip hands back bytes, so the target must be opened in binary mode; the context
    # managers close both handles even if the copy fails
    with gzip.open(hmm, 'rb') as compressed_model, open(hmm_file_path, 'wb') as hmm_file:
        hmm_file.write(compressed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')
    cmd_line = ('hmmscan -o "%s" %s --tblout "%s" "%s" "%s" >> "%s" 2>&1'
                % (self.hmm_scan_output, cut_off_flag, self.hmm_scan_hits_shitty,
                   hmm_file_path, self.proteins_in_contigs, log_file_path))
    with open(log_file_path, "a") as myfile:
        myfile.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        # do not leave the progress bar open behind the error
        self.progress.end()
        # call-style raise works on both Python 2 and 3
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # thank you, hmmscan, for not generating a simple TAB-delimited, because we programmers
    # love to write little hacks like this into our code:
    with open(self.hmm_scan_hits_shitty) as hmm_hits_file, open(self.hmm_scan_hits, 'w') as parseable_output:
        for line in hmm_hits_file:
            if line.startswith('#'):
                continue

            # keep the first 18 whitespace-separated fields, TAB-delimited
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None
def run_hmmscan(self, source, genes_in_model, hmm, ref, cut_off_flag="--cut_ga"):
    """Run a multi-threaded `hmmscan` for one HMM source and return the parsed hits path.

    The gzipped model `hmm` is unpacked into a fresh temporary directory,
    compressed with `hmmpress`, and then searched against
    `self.protein_sequences_fasta` with `hmmscan`, using
    `self.num_threads_to_use` as the `--cpu` value. The whitespace-padded
    `--tblout` table is finally rewritten as a TAB-delimited file that keeps
    the first 18 columns of every non-comment line.

    Args:
        source: name of the HMM source; only used in the header shown to the user.
        genes_in_model: sized collection of genes in the model; only its
            length is reported.
        hmm: path to the gzip-compressed HMM model file.
        ref: reference for the model; reported as 'unknown' when falsy.
        cut_off_flag: cut-off flag inserted verbatim into the `hmmscan` command.

    Returns:
        The path of the TAB-delimited hits file, or None when
        `filesnpaths.get_num_lines_in_file` reports no lines in it.

    Raises:
        ConfigError: if `utils.run_command` returns a truthy value for the
            `hmmpress` call, or if `hmmscan` did not create its `--tblout`
            output file.
    """
    self.run.warning('', header = 'HMM Profiling for %s' % source, lc = 'green')
    self.run.info('Reference', ref if ref else 'unknown')
    self.run.info('Pfam model', hmm)
    self.run.info('Number of genes', len(genes_in_model))
    self.run.info('Number of CPUs will be used for search', self.num_threads_to_use)

    # every run works in its own temporary directory; it is remembered in
    # `self.tmp_dirs` (NOTE(review): presumably so it can be cleaned up later).
    tmp_dir = filesnpaths.get_temp_directory_path()
    self.tmp_dirs.append(tmp_dir)

    self.hmm_scan_output = os.path.join(tmp_dir, 'hmm.output')
    self.hmm_scan_hits = os.path.join(tmp_dir, 'hmm.hits')
    self.hmm_scan_hits_shitty = os.path.join(tmp_dir, 'hmm.hits.shitty')
    log_file_path = os.path.join(tmp_dir, '00_log.txt')

    self.run.info('Temporary work dir', tmp_dir)
    self.run.info('HMM scan output', self.hmm_scan_output)
    self.run.info('HMM scan hits', self.hmm_scan_hits)
    self.run.info('Log file', log_file_path)

    self.progress.new('Unpacking the model into temporary work directory')
    self.progress.update('...')
    hmm_file_path = os.path.join(tmp_dir, 'hmm.txt')
    # the gzip stream yields bytes, so the destination is opened in binary
    # mode; both handles are closed even if the copy fails.
    with gzip.open(hmm, 'rb') as packed_model, open(hmm_file_path, 'wb') as unpacked_model:
        unpacked_model.write(packed_model.read())
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Compressing the pfam model')
    cmd_line = ('hmmpress "%s" >> "%s" 2>&1' % (hmm_file_path, log_file_path))
    with open(log_file_path, "a") as log_file:
        log_file.write('CMD: ' + cmd_line + '\n')
    ret_val = utils.run_command(cmd_line)

    if ret_val:
        # close the progress display first so the error is not printed on
        # top of a still-active progress line.
        self.progress.end()
        raise ConfigError("The last call did not work quite well. Most probably the version of HMMER "
                          "you have installed is not up-to-date enough. Just to make sure what went "
                          "wrong please take a look at the log file ('%s'). Please visit %s to see what "
                          "is the latest version availalbe. You can learn which version of HMMER you have "
                          "on your system by typing 'hmmpress -h'"
                          % (log_file_path, 'http://hmmer.janelia.org/download.html'))
    self.progress.end()

    self.progress.new('Processing')
    self.progress.update('Performing HMM scan ...')
    cmd_line = ('hmmscan -o "%s" %s --cpu %d --tblout "%s" "%s" "%s" >> "%s" 2>&1'
                % (self.hmm_scan_output,
                   cut_off_flag,
                   self.num_threads_to_use,
                   self.hmm_scan_hits_shitty,
                   hmm_file_path,
                   self.protein_sequences_fasta,
                   log_file_path))
    with open(log_file_path, "a") as log_file:
        log_file.write('CMD: ' + cmd_line + '\n')
    utils.run_command(cmd_line)

    if not os.path.exists(self.hmm_scan_hits_shitty):
        self.progress.end()
        raise ConfigError("Something went wrong with hmmscan, and it failed to generate the "
                          "expected output :/ Fortunately, this log file should tell you what "
                          "might be the problem: '%s'. Please do not forget to include this "
                          "file if you were to ask for help." % log_file_path)

    self.progress.end()

    # hmmscan's --tblout is padded with a variable number of spaces rather than
    # TAB-delimited: drop the comment lines and re-join the first 18
    # whitespace-separated fields of each hit with TABs.
    with open(self.hmm_scan_hits_shitty) as raw_hits, open(self.hmm_scan_hits, 'w') as parseable_output:
        for line in raw_hits:
            if line.startswith('#'):
                continue
            parseable_output.write('\t'.join(line.split()[0:18]) + '\n')

    num_raw_hits = filesnpaths.get_num_lines_in_file(self.hmm_scan_hits)
    self.run.info('Number of raw hits', num_raw_hits)

    return self.hmm_scan_hits if num_raw_hits else None