def _archive_db(self, old_db_file):
    '''
    Archive an old database, then delete the uncompressed original.

    The database is compressed with tar into the "old" sub-directory of
    self.DATABASE_DIR (created on first use), under the name
    old_db_file + self.ARCHIVE_SUFFIX.

    Parameters
    ----------
    old_db_file - String. Name, relative to self.DATABASE_DIR, of the old
                  database file or directory to archive
    '''
    # Local import so this block does not depend on the file's import header.
    import shlex

    old_database_path = os.path.join(self.DATABASE_DIR, "old")
    if not os.path.isdir(old_database_path):
        logging.info('Creating directory to store databases: %s',
                     old_database_path)
        os.makedirs(old_database_path)

    old_db_path_archive = os.path.join(old_database_path,
                                       old_db_file + self.ARCHIVE_SUFFIX)
    old_db_path = os.path.join(self.DATABASE_DIR, old_db_file)

    logging.info('Compressing old database')
    # The command is a single shell string (it relies on ">" redirection),
    # so quote the paths to survive spaces and shell metacharacters.
    cmd = "tar -cvzf %s %s > /dev/null" % (shlex.quote(old_db_path_archive),
                                           shlex.quote(old_db_path))
    run_command(cmd)

    logging.info('Cleaning up')
    # rmtree only works on directories; fall back to remove() so a database
    # shipped as a single file is cleaned up instead of raising.
    if os.path.isdir(old_db_path):
        shutil.rmtree(old_db_path)
    else:
        os.remove(old_db_path)
def diamond_search(self, tmp_name, output_path, database):
    '''
    Carry out a diamond blastp search.

    The query sequences are whatever `bash tmp_name` prints to standard
    output; they are piped to diamond through /dev/stdin.

    Parameters
    ----------
    tmp_name    - string. Path to a script that is executed with bash to
                  produce the query sequences
    output_path - string. Path to file to output results into
    database    - string. Path to the diamond database to search against
    '''
    arguments = [
        f'bash {tmp_name} | diamond blastp',
        '--quiet',
        '--outfmt 6',
        '--max-target-seqs 1',
        '--query /dev/stdin',
        f'--out {output_path}',
        f'--db {database}',
        f'--threads {self.threads}',
    ]

    # (flag, configured value, whether the value is scaled by 100 before it
    # is handed to diamond). A falsy value means the flag is omitted.
    optional_cutoffs = (
        ('--evalue', self.evalue, False),
        ('--min-score', self.bit, False),
        ('--id', self.percent_id_cutoff, True),
        ('--query-cover', self.aln_query, True),
        ('--subject-cover', self.aln_reference, True),
    )
    for flag, value, as_percentage in optional_cutoffs:
        if not value:
            continue
        shown = value * 100 if as_percentage else value
        arguments.append(f'{flag} {shown}')

    run_command(' '.join(arguments) + ' ')
def call_proteins(self, genome_directory):
    '''
    Use prodigal to call proteins within the genomes

    Parameters
    ----------
    genome_directory - string. Directory containing one nucleotide file
                       (ending in self.suffix) for each input genome

    Outputs
    -------
    returns a list with one tuple per file found in the protein output
    directory: (self.light, protein file path, nucleotide file path,
    gene file path)
    '''
    protein_directory_path = path.join(self.output_directory,
                                       self.GENOME_PROTEINS)
    gene_directory_path = path.join(self.output_directory, self.GENOME_GENES)
    mkdir(protein_directory_path)
    mkdir(gene_directory_path)

    genome_count = sum(1 for genome in listdir(genome_directory)
                       if genome.endswith(self.suffix))
    logging.info(" - Calling proteins for %i genomes", genome_count)

    # List the genomes, strip directory and suffix to get bare names, then
    # let parallel substitute each name for "{}" in the prodigal call.
    cmd = (
        f"ls {genome_directory}/*{self.suffix} | "
        f"sed 's/{self.suffix}//g' | "
        "grep -o '[^/]*$' | "
        f"parallel -j {self.parallel} "
        "prodigal -q -p meta -o /dev/null "
        f"-d {gene_directory_path}/{{}}{self.suffix} "
        f"-a {protein_directory_path}/{{}}{self.PROTEINS_SUFFIX} "
        f"-i {genome_directory}/{{}}{self.suffix} "
        "> /dev/null 2>&1"
    )
    run_command(cmd)

    # Derive the nucleotide and gene file names from each protein file name
    # rather than zipping two directory listings: listdir order is arbitrary
    # and genome_directory may hold unrelated files, so positional pairing
    # can attach a protein file to the wrong genome.
    genome_list = []
    for genome_protein in sorted(listdir(protein_directory_path)):
        genome_file_name = genome_protein.replace(self.PROTEINS_SUFFIX,
                                                  self.suffix)
        genome_list.append((
            self.light,
            path.join(protein_directory_path, genome_protein),
            path.join(genome_directory, genome_file_name),
            path.join(gene_directory_path, genome_file_name),
        ))

    return genome_list
def draw_barplots(self, annotation_matrix, pvalue, output_directory):
    '''
    Run the R script at self.draw_barplots_script_path to draw the KO
    breakdown plots. Everything the script prints is discarded.

    Parameters
    ----------
    annotation_matrix - passed to the script as -i
    pvalue            - passed to the script as -p
    output_directory  - passed to the script as -o
    '''
    logging.info(' - Generating KO breakdown plots')

    rscript_call = ' '.join([
        f"Rscript {self.draw_barplots_script_path}",
        f"-i {annotation_matrix}",
        f"-o {output_directory}",
        f"-k {self.ko00000}",
        f"-p {pvalue}",
        "> /dev/null 2>&1",
    ])
    run_command(rscript_call)
def draw_pca_plot(self, annotation_matrix, metadata, output_directory):
    '''
    Run the R script at self.draw_pca_script_path to draw a PCA plot.
    Everything the script prints is discarded.

    Parameters
    ----------
    annotation_matrix - passed to the script as -i
    metadata          - passed to the script as -m
    output_directory  - directory the plot is written into; the file name
                        within it is self.output_pca_plot
    '''
    logging.info(' - Generating PCA plot')

    plot_file = os.path.join(output_directory, self.output_pca_plot)
    rscript_call = ' '.join([
        f"Rscript {self.draw_pca_script_path}",
        f"-i {annotation_matrix}",
        f"-m {metadata}",
        f"-o {plot_file}",
        "> /dev/null 2>&1",
    ])
    run_command(rscript_call)
def hmm_search(self, output_path, database, hmmcutoff):
    '''
    Carry out a hmmsearch on every file in the called-proteins directory
    (self.output_directory/self.GENOME_PROTEINS), running the searches
    through GNU parallel.

    Parameters
    ----------
    output_path - string. Directory to write the results into; one
                  --domtblout file per input, named after the input with
                  self.ANNOTATION_SUFFIX appended
    database    - string. Path to HMM to use for searching
    hmmcutoff   - truthy to use the gathering / noise / trusted cutoff
                  flags configured on this object for pfam or tigrfam
                  databases; otherwise the options returned by
                  self._default_hmmsearch_options() are used
    '''
    input_genome_path = path.join(self.output_directory, self.GENOME_PROTEINS)

    # List the protein files, strip their suffix to get bare genome names,
    # and let parallel substitute each name for "{}".
    cmd = (
        f"ls {input_genome_path} | sed 's/{self.PROTEINS_SUFFIX}//g' | "
        f"parallel -j {self.parallel} hmmsearch --cpu {self.threads} "
        "-o /dev/null --noali "
        f"--domtblout {output_path}/{{}}{self.ANNOTATION_SUFFIX} "
    )

    # Pick the (ga, nc, tc) switches that apply to this database, if any.
    # The database family is recognised by a substring of its path.
    selected_cutoffs = None
    if hmmcutoff:
        pfam_cutoffs = (self.cut_ga_pfam, self.cut_nc_pfam, self.cut_tc_pfam)
        if any(pfam_cutoffs) and 'pfam' in database:
            selected_cutoffs = pfam_cutoffs
        else:
            tigrfam_cutoffs = (self.cut_ga_tigrfam, self.cut_nc_tigrfam,
                               self.cut_tc_tigrfam)
            if any(tigrfam_cutoffs) and 'tigrfam' in database:
                selected_cutoffs = tigrfam_cutoffs

    if selected_cutoffs is None:
        cmd += self._default_hmmsearch_options()
    else:
        cutoff_flags = (" --cut_ga ", " --cut_nc ", " --cut_tc ")
        for flag, enabled in zip(cutoff_flags, selected_cutoffs):
            if enabled:
                cmd += flag

    # Re-attach the same suffix that sed stripped above (previously a
    # hard-coded ".faa"), so the input name always matches a listed file.
    cmd += "%s %s/{}%s 2> /dev/null" % (database, input_genome_path,
                                        self.PROTEINS_SUFFIX)
    run_command(cmd)
def _download_db(self, new_db_file):
    '''
    Download and decompress a new database file

    Fetches self.ftp + new_db_file and self.ftp + self.VERSION into
    self.DATABASE_DIR with wget, extracts the archive there with tar and
    deletes the downloaded archive.

    Parameters
    ----------
    new_db_file - String. File name of new database to download and
                  decompress.
    '''
    # Local import so this block does not depend on the file's import header.
    import shlex

    new_db_path_archive = os.path.join(self.DATABASE_DIR, new_db_file)
    version_path = os.path.join(self.DATABASE_DIR, self.VERSION)

    logging.info('Downloading new database: %s', new_db_file)
    try:
        # The commands are shell strings (they use redirection), so quote
        # every interpolated URL and path.
        cmd = 'wget -q %s -O %s' % (shlex.quote(self.ftp + new_db_file),
                                    shlex.quote(new_db_path_archive))
        run_command(cmd)

        cmd = 'wget -q %s -O %s' % (shlex.quote(self.ftp + self.VERSION),
                                    shlex.quote(version_path))
        run_command(cmd)

        logging.info('Decompressing new database')
        cmd = 'tar -xvzf %s -C %s > /dev/null' % (
            shlex.quote(new_db_path_archive),
            shlex.quote(self.DATABASE_DIR))
        run_command(cmd)
    finally:
        # Remove the archive even if a step above raised, so a partial
        # download is not left behind in the database directory.
        logging.info('Cleaning up')
        if os.path.exists(new_db_path_archive):
            os.remove(new_db_path_archive)
def run_mcl(self, blast_abc, output_directory_path):
    '''
    Parse the protein clusters produced from Mmseqs2 using mcl

    Runs mcxload, mcl and mcxdump in turn, then reads the dumped clusters.

    Parameters
    ----------
    blast_abc - string. an abc file for mcl to run on. More information on
                the format of abc files can be found at
                https://micans.org/mcl/man/clmprotocols.html
    output_directory_path - string. Path to write the results of mcl
                            parsing to.

    Outputs
    -------
    returns a dict mapping "ortholog_<n>" (n counts output lines from 1) to
    the set of tab-separated entries on that line of the mcxdump output
    '''
    dict_path = path.join(output_directory_path, "alignDb.dict")
    mci_path = path.join(output_directory_path, "alignDb.mci")
    cluster_path = path.join(output_directory_path, "mcl_clusters.tsv")
    output_path = path.join(output_directory_path, "mcl_clusters.convert.tsv")

    logging.info(' - Preparing network')
    cmd = (
        f"mcxload -abc {blast_abc} -write-tab {dict_path} -o {mci_path} "
        "--stream-mirror --stream-neg-log10 > /dev/null 2>&1"
    )
    run_command(cmd)

    logging.info(' - Finding orthologs')
    cmd = (
        f"mcl {mci_path} -te {self.threads} -I {self.inflation} "
        f"-o {cluster_path} > /dev/null 2>&1"
    )
    run_command(cmd)

    logging.info(' - Reformatting output')
    cmd = (
        f"mcxdump -icl {cluster_path} -o {output_path} -tabr {dict_path} "
        "> /dev/null 2>&1"
    )
    run_command(cmd)

    ortholog_dict = dict()
    # The context manager closes the results file once it has been read.
    with open(output_path) as cluster_lines:
        for ortholog, line in enumerate(cluster_lines, start=1):
            ortholog_dict["ortholog_%i" % ortholog] = \
                set(line.strip().split('\t'))

    return ortholog_dict
def annotate_hypothetical(self, genomes_list):
    '''
    Sort proteins coded by each genome into homologous clusters.

    NOTE(review): a second definition of annotate_hypothetical appears
    later in this source; if both belong to the same class the later one
    replaces this one - confirm which is intended.

    Inputs
    ------
    genomes_list - list. list of Genome objects (their .name and .path
                   attributes are read here)

    Outputs
    -------
    returns (cluster_ids, ortholog_ids): the result of
    self.parse_cluster_results and the keys of the dict returned by
    self.run_mcl
    '''
    output_directory_path = path.join(self.output_directory,
                                      self.GENOME_HYPOTHETICAL)
    mkdir(output_directory_path)

    db_path = path.join(output_directory_path, "db")
    clu_path = path.join(output_directory_path, "clu")
    align_path = path.join(output_directory_path, "alignDb")
    blast_output_path = path.join(output_directory_path, "alignDb.m8")
    formatted_blast_output_path = path.join(output_directory_path,
                                            "alignDb.formatted.m8")
    clu_tsv_path = path.join(output_directory_path,
                             "hypothetical_clusters.tsv")

    with tempfile.NamedTemporaryFile() as temp:
        # One sed call per genome; each prefixes every FASTA header with
        # "<genome name>~" and prints the result to standard output.
        rename_script = ''.join(
            f"sed \"s/>/>{genome.name}~/g\" {genome.path}\n"
            for genome in genomes_list
        )
        # The script must actually be written (as bytes: the file is opened
        # in binary mode) before bash runs it; previously it was built but
        # never written, so bash executed an empty file.
        temp.write(rename_script.encode())
        temp.flush()

        logging.info(' - Generating MMSeqs2 database')
        cmd = "bash %s | sponge | mmseqs createdb /dev/stdin %s -v 0 > /dev/null 2>&1" % (
            temp.name, db_path)
        run_command(cmd)

    logging.info(' - Clustering genome proteins')
    # The scratch directory handed to mmseqs is deleted when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        cmd = (
            f"mmseqs cluster {db_path} {clu_path} {tmp_dir} "
            f"--max-seqs 1000 "
            f"--threads {self.threads} "
            f"--min-seq-id {self.percent_id_cutoff} "
            f"-e {self.evalue} "
            f"-c {self.fraction_aligned} "
            "-v 0 "
        )
        run_command(cmd)

    logging.info(' - Extracting clusters')
    cmd = 'mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1' % (
        db_path, db_path, clu_path, clu_tsv_path)
    run_command(cmd)

    logging.info(
        ' - Computing Smith-Waterman alignments for clustering results'
    )
    cmd = "mmseqs alignall %s %s %s --alignment-mode 3 -v 0 " % (
        db_path, clu_path, align_path)
    run_command(cmd)

    logging.info(' - Converting to BLAST-like output')
    cmd = "mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1 " % (
        db_path, db_path, align_path, blast_output_path)
    # --format-output query,target,bits
    run_command(cmd)

    logging.info(' - Reformatting BLAST output')
    # awk reads the .lookup file first (FNR==NR) to map column 1 to
    # column 2, then for the BLAST-like file replaces column 3 through that
    # map, blanks column 1 and prints fields 2..NF-1 tab-separated.
    awk_program = (
        'FNR==NR{a[$1]=$2;next}'
        '{$3=a[$3]; $1=""; '
        'for(i=2;i<NF;i++){printf("%s\t",$i)} '
        'printf("\\n")}'
    )
    cmd = "OFS=\"\t\" awk '%s' %s %s | cut -f1,2,5 > %s" % (
        awk_program, db_path + '.lookup', blast_output_path,
        formatted_blast_output_path)
    run_command(cmd)

    ortholog_dict = self.run_mcl(formatted_blast_output_path,
                                 output_directory_path)
    ortholog_ids = ortholog_dict.keys()
    cluster_ids = self.parse_cluster_results(clu_tsv_path, genomes_list,
                                             ortholog_dict,
                                             output_directory_path)
    return cluster_ids, ortholog_ids
def annotate_hypothetical(self, genomes_list):
    '''
    Sort proteins coded by each genome into homologous clusters.

    Inputs
    ------
    genomes_list - list. list of Genome objects (their .name and .path
                   attributes are read here)

    Outputs
    -------
    returns (cluster_ids, ortholog_ids). cluster_ids is the result of
    self.parse_cluster_results. ortholog_ids holds the keys of the dict
    returned by self.run_mcl when self.annotate_ortholog is truthy, and is
    an empty list otherwise.
    '''
    output_directory_path = path.join(self.output_directory,
                                      self.GENOME_HYPOTHETICAL)
    mkdir(output_directory_path)

    db_path = path.join(output_directory_path, "db")
    clu_path = path.join(output_directory_path, "clu")
    align_path = path.join(output_directory_path, "alignDb")
    blast_output_path = path.join(output_directory_path, "alignDb.m8")
    formatted_blast_output_path = path.join(output_directory_path,
                                            "alignDb.formatted.m8")
    clu_tsv_path = path.join(output_directory_path,
                             "hypothetical_clusters.tsv")

    # The renamed copies live in a scratch directory inside the output
    # directory instead of randomly named files in the current working
    # directory, so they are removed even when a command below raises and
    # no private tempfile API is needed.
    with tempfile.TemporaryDirectory(dir=output_directory_path) as rename_dir:
        renamed_genomes = []
        for index, genome in enumerate(genomes_list):
            renamed_genome = path.join(rename_dir, f"genome_{index}")
            # Prefix every FASTA header with "<genome name>~".
            cmd = f"sed 's/>/>{genome.name}~/g' {genome.path} > {renamed_genome}"
            run_command(cmd)
            renamed_genomes.append(renamed_genome)

        logging.info(' - Generating MMSeqs2 database')
        cmd = f"mmseqs createdb {' '.join(renamed_genomes)} {db_path}"
        run_command(cmd)

    logging.info(' - Clustering genome proteins')
    # The scratch directory handed to mmseqs is deleted when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        cmd = (
            f"mmseqs cluster {db_path} {clu_path} {tmp_dir} "
            f"--threads {self.threads} "
            f"--min-seq-id {self.percent_id_cutoff} "
            f"-c {self.fraction_aligned} "
            "-v 0"
        )
        run_command(cmd)

    logging.info(' - Extracting clusters')
    cmd = (
        f"mmseqs createtsv {db_path} {db_path} {clu_path} {clu_tsv_path} "
        f"--threads {self.threads} "
        "-v 0"
    )
    run_command(cmd)

    if self.annotate_ortholog:
        logging.info(
            ' - Computing Smith-Waterman alignments for clustering results'
        )
        cmd = (
            f"mmseqs alignall {db_path} {clu_path} {align_path} "
            "--alignment-mode 3 "
            f"--threads {self.threads} "
            "-v 0"
        )
        run_command(cmd)

        logging.info(' - Converting to BLAST-like output')
        cmd = (
            f"mmseqs createtsv {db_path} {db_path} {align_path} "
            f"{blast_output_path} "
            f"--threads {self.threads} "
            "-v 0"
        )
        # --format-output query,target,bits
        run_command(cmd)

        logging.info(' - Reformatting BLAST output')
        # awk reads the .lookup file first (FNR==NR) to map column 1 to
        # column 2, then for the BLAST-like file replaces column 3 through
        # that map, blanks column 1 and prints fields 2..NF-1
        # tab-separated.
        awk_program = (
            'FNR==NR{a[$1]=$2;next}'
            '{$3=a[$3]; $1=""; '
            'for(i=2;i<NF;i++){printf("%s\t",$i)} '
            'printf("\\n")}'
        )
        cmd = "OFS=\"\t\" awk '%s' %s %s | cut -f1,2,5 > %s" % (
            awk_program, db_path + '.lookup', blast_output_path,
            formatted_blast_output_path)
        run_command(cmd)

        ortholog_dict = self.run_mcl(formatted_blast_output_path,
                                     output_directory_path)
        ortholog_ids = ortholog_dict.keys()
    else:
        ortholog_dict = dict()
        ortholog_ids = list()

    cluster_ids = self.parse_cluster_results(clu_tsv_path, genomes_list,
                                             ortholog_dict,
                                             output_directory_path)
    return cluster_ids, ortholog_ids