Exemplo n.º 1
0
    def _archive_db(self, old_db_file):
        '''
        Compress an old database into the "old" sub-directory of
        self.DATABASE_DIR, then delete the uncompressed original.

        Parameters
        ----------
        old_db_file	- String. Name of the old database (relative to
                      self.DATABASE_DIR) to archive. It is removed with
                      shutil.rmtree once the archive has been written.
        '''
        # Local import so the method does not rely on a module-level import.
        from shlex import quote

        old_database_path = os.path.join(self.DATABASE_DIR, "old")
        if not os.path.isdir(old_database_path):
            logging.info('Creating directory to store databases: %s',
                         old_database_path)
            # exist_ok guards against the directory being created by someone
            # else between the isdir() check above and this call.
            os.makedirs(old_database_path, exist_ok=True)

        old_db_path_archive = os.path.join(old_database_path,
                                           old_db_file + self.ARCHIVE_SUFFIX)
        old_db_path = os.path.join(self.DATABASE_DIR, old_db_file)

        logging.info('Compressing old database')
        # The command uses shell redirection, so it is presumably interpreted
        # by a shell; quote the paths so that spaces or shell metacharacters in
        # the install location cannot split or alter the command.
        cmd = "tar -cvzf %s %s > /dev/null" % (quote(old_db_path_archive),
                                               quote(old_db_path))
        run_command(cmd)

        logging.info('Cleaning up')
        shutil.rmtree(old_db_path)
Exemplo n.º 2
0
    def diamond_search(self, tmp_name, output_path, database):
        '''
        Carry out a diamond blastp search.

        Parameters
        ----------
        tmp_name    - string. Path to a script that is executed with bash; its
                      standard output is piped to diamond as the query
        output_path - string. Path to file to output results into (--out)
        database    - string. Path to the diamond database to search (--db)

        Optional thresholds (self.evalue, self.bit, self.percent_id_cutoff,
        self.aln_query, self.aln_reference) are appended only when truthy. The
        last three are multiplied by 100 before being handed to diamond
        (presumably they are stored as fractions - confirm with the caller).
        '''

        base_command = f'bash {tmp_name} | diamond blastp \
                                    --quiet \
                                    --outfmt 6 \
                                    --max-target-seqs 1 \
                                    --query /dev/stdin \
                                    --out {output_path} \
                                    --db {database} \
                                    --threads {self.threads} '

        # Gather the optional filters in the order diamond will receive them.
        optional_flags = []

        if self.evalue:
            optional_flags.append(f'--evalue {self.evalue} ')

        if self.bit:
            optional_flags.append(f'--min-score {self.bit} ')

        if self.percent_id_cutoff:
            optional_flags.append(f'--id {self.percent_id_cutoff*100} ')

        if self.aln_query:
            optional_flags.append(f"--query-cover {self.aln_query*100} ")

        if self.aln_reference:
            optional_flags.append(f"--subject-cover {self.aln_reference*100} ")

        run_command(base_command + ''.join(optional_flags))
Exemplo n.º 3
0
    def call_proteins(self, genome_directory):
        '''
        Use prodigal to call proteins within the genomes

        Parameters
        ----------
        genome_directory  - string. Directory containing a nucleotide file
                            ending in self.suffix for each input genome

        Outputs
        -------
        returns a list with one tuple per genome for which a protein file was
        produced:
            (self.light, protein file path, nucleotide file path, gene file path)
        '''
        protein_directory_path = path.join(self.output_directory, self.GENOME_PROTEINS)
        gene_directory_path = path.join(self.output_directory, self.GENOME_GENES)
        mkdir(protein_directory_path)
        mkdir(gene_directory_path)

        # Base name (file name minus self.suffix) of every input genome.
        # Sorted so the returned list has a deterministic order.
        suffix_length = len(self.suffix)
        genome_names = sorted(
            genome[:len(genome) - suffix_length]
            for genome in listdir(genome_directory)
            if genome.endswith(self.suffix)
        )

        logging.info("    - Calling proteins for %i genomes", len(genome_names))
        cmd = "ls %s/*%s | \
                    sed 's/%s//g' | \
                    grep -o '[^/]*$' | \
                    parallel -j %s \
                        prodigal \
                            -q \
                            -p meta \
                            -o /dev/null \
                            -d %s/{}%s \
                            -a %s/{}%s \
                            -i %s/{}%s \
                            > /dev/null 2>&1" \
                % (genome_directory, self.suffix, self.suffix, self.parallel, gene_directory_path,
                   self.suffix, protein_directory_path, self.PROTEINS_SUFFIX, genome_directory,
                   self.suffix)

        run_command(cmd)

        # Derive every path from the genome's base name instead of zipping two
        # independent directory listings: listdir() order is arbitrary and the
        # input directory may hold unrelated files, so zipping could pair a
        # protein file with the wrong nucleotide file.
        genome_list = list()
        for genome_name in genome_names:
            output_genome_protein_path = path.join(protein_directory_path,
                                                   genome_name + self.PROTEINS_SUFFIX)

            if not path.isfile(output_genome_protein_path):
                # No protein file was written for this genome, so there is
                # nothing downstream steps could read for it.
                logging.warning("    - No protein file found for genome %s, skipping",
                                genome_name)
                continue

            output_genome_nucl_path = path.join(genome_directory,
                                                genome_name + self.suffix)
            # Gene files are written by prodigal (-d) using self.suffix.
            output_genome_gene_path = path.join(gene_directory_path,
                                                genome_name + self.suffix)

            genome_list.append((self.light, output_genome_protein_path,
                                output_genome_nucl_path, output_genome_gene_path))

        return genome_list
Exemplo n.º 4
0
    def draw_barplots(self, annotation_matrix, pvalue, output_directory):
        '''
        Run the R script at self.draw_barplots_script_path to generate the KO
        breakdown plots. All output of the script is discarded.

        Parameters
        ----------
        annotation_matrix - passed to the script as -i
        pvalue            - passed to the script as -p
        output_directory  - passed to the script as -o
        '''
        logging.info('	- Generating KO breakdown plots')
        # self.ko00000 is handed to the script through -k.
        rscript_call = f"Rscript {self.draw_barplots_script_path} \
                    -i {annotation_matrix} \
                    -o {output_directory} \
                    -k {self.ko00000} \
                    -p {pvalue} > /dev/null 2>&1"
        run_command(rscript_call)
Exemplo n.º 5
0
    def draw_pca_plot(self, annotation_matrix, metadata, output_directory):
        '''
        Run the R script at self.draw_pca_script_path to generate a PCA plot.
        All output of the script is discarded.

        Parameters
        ----------
        annotation_matrix - passed to the script as -i
        metadata          - passed to the script as -m
        output_directory  - directory in which the plot, named
                            self.output_pca_plot, is requested (-o)
        '''
        logging.info('	- Generating PCA plot')
        plot_destination = os.path.join(output_directory, self.output_pca_plot)
        rscript_call = f"Rscript {self.draw_pca_script_path} \
                    -i {annotation_matrix} \
                    -m {metadata} \
                    -o {plot_destination} > /dev/null 2>&1"
        run_command(rscript_call)
Exemplo n.º 6
0
    def hmm_search(self, output_path, database, hmmcutoff):
        '''
        Carry out a hmmsearch against every protein file found in the
        self.GENOME_PROTEINS sub-directory of self.output_directory.

        Parameters
        ----------
        output_path           - string. Directory to write one domtblout file
                                per genome into (named <genome> +
                                self.ANNOTATION_SUFFIX)
        database              - string. Path to HMM to use for searching
        hmmcutoff             - when truthy, use the model cutoffs
                                (--cut_ga / --cut_nc / --cut_tc) selected by
                                the self.cut_*_pfam or self.cut_*_tigrfam
                                settings, provided 'pfam' or 'tigrfam' occurs
                                in the database path. Otherwise the options
                                from self._default_hmmsearch_options() are used
        '''

        input_genome_path = path.join(self.output_directory,
                                      self.GENOME_PROTEINS)
        cmd = "ls %s | sed 's/%s//g' | parallel -j %s\
                                                hmmsearch \
                                                    --cpu %s \
                                                    -o /dev/null \
                                                    --noali \
                                                    --domtblout %s/{}%s " \
                          % (input_genome_path, self.PROTEINS_SUFFIX, self.parallel,
                             self.threads, output_path, self.ANNOTATION_SUFFIX)

        cutoff_options = ""
        if hmmcutoff:
            # pfam is checked before tigrfam; the first family that has at
            # least one cutoff enabled and whose name occurs in the database
            # path decides which flags are used.
            family_cutoffs = (
                ('pfam', (self.cut_ga_pfam, self.cut_nc_pfam, self.cut_tc_pfam)),
                ('tigrfam', (self.cut_ga_tigrfam, self.cut_nc_tigrfam,
                             self.cut_tc_tigrfam)),
            )
            for family, (cut_ga, cut_nc, cut_tc) in family_cutoffs:
                if (cut_ga or cut_nc or cut_tc) and family in database:
                    cutoff_options = "".join(
                        flag for enabled, flag in ((cut_ga, " --cut_ga "),
                                                   (cut_nc, " --cut_nc "),
                                                   (cut_tc, " --cut_tc "))
                        if enabled)
                    break

        # A matched family always yields at least one flag, so an empty string
        # means "no model cutoff applies" and the defaults are used instead.
        cmd += cutoff_options or self._default_hmmsearch_options()

        # Genome names were obtained by stripping self.PROTEINS_SUFFIX, so the
        # same suffix (rather than a hard-coded '.faa') rebuilds the file name.
        cmd += "%s %s/{}%s 2> /dev/null" % (database, input_genome_path,
                                            self.PROTEINS_SUFFIX)

        run_command(cmd)
Exemplo n.º 7
0
    def _download_db(self, new_db_file):
        '''
        Download and decompress a new database file

        The archive and the file named self.VERSION are both fetched from
        self.ftp into self.DATABASE_DIR. The archive is extracted there and
        then deleted.

        Parameters
        ----------
        new_db_file	- String. File name of new database to download and decompress.
        '''
        # Local import so the method does not rely on a module-level import.
        from shlex import quote

        new_db_path_archive = os.path.join(self.DATABASE_DIR, new_db_file)
        version_path = os.path.join(self.DATABASE_DIR, self.VERSION)

        # URLs and paths are quoted so that spaces or shell metacharacters
        # (e.g. in the install location) cannot split or alter the commands.
        logging.info('Downloading new database: %s', new_db_file)
        cmd = f'wget -q {quote(self.ftp + new_db_file)} -O {quote(new_db_path_archive)}'
        run_command(cmd)

        cmd = f'wget -q {quote(self.ftp + self.VERSION)} -O {quote(version_path)}'
        run_command(cmd)

        logging.info('Decompressing new database')
        cmd = 'tar -xvzf %s -C %s > /dev/null' % (quote(new_db_path_archive),
                                                  quote(self.DATABASE_DIR))
        run_command(cmd)

        logging.info('Cleaning up')
        os.remove(new_db_path_archive)
Exemplo n.º 8
0
    def run_mcl(self, blast_abc, output_directory_path):
        '''
        Parse the protein clusters produced from Mmseqs2 using mcl

        Parameters
        ----------
        blast_abc - string. an abc file for mcl to run on. More information on the format of abc
                    files can be found at https://micans.org/mcl/man/clmprotocols.html
        output_directory_path - string. Path to write the results of mcl parsing to.

        Outputs
        -------
        returns a dictionary mapping "ortholog_<n>" (n counts the lines of the
        mcxdump output from 1) to the set of tab-separated entries on that line
        '''

        dict_path = path.join(output_directory_path, "alignDb.dict")
        mci_path = path.join(output_directory_path, "alignDb.mci")
        cluster_path = path.join(output_directory_path, "mcl_clusters.tsv")
        output_path = path.join(output_directory_path, "mcl_clusters.convert.tsv")

        logging.info('    - Preparing network')
        cmd = f"mcxload \
                    -abc {blast_abc} \
                    -write-tab {dict_path} \
                    -o {mci_path} \
                    --stream-mirror \
                    --stream-neg-log10 \
                    > /dev/null 2>&1"
        run_command(cmd)

        logging.info('    - Finding orthologs')
        cmd = f'mcl \
                    {mci_path} \
                    -te {self.threads} \
                    -I {self.inflation} \
                    -o {cluster_path} \
                    > /dev/null 2>&1'
        run_command(cmd)

        logging.info('    - Reformatting output')
        cmd = f'mcxdump \
                    -icl {cluster_path} \
                    -o {output_path} \
                    -tabr {dict_path} \
                    > /dev/null 2>&1'
        run_command(cmd)

        ortholog_dict = dict()
        # The context manager closes the file even if parsing raises.
        with open(output_path) as cluster_file:
            for ortholog, line in enumerate(cluster_file, start=1):
                ortholog_dict["ortholog_%i" % ortholog] = set(line.strip().split('\t'))

        return ortholog_dict
Exemplo n.º 9
0
    def annotate_hypothetical(self, genomes_list):
        '''
        Sort proteins coded by each genome into homologous clusters.

        Inputs
        ------
        genomes_list - list. list of Genome objects (their .name and .path
                       attributes are used here)

        Outputs
        -------
        returns (cluster_ids, ortholog_ids): the result of
        self.parse_cluster_results and the keys of the dictionary returned by
        self.run_mcl
        '''
        output_directory_path = path.join(self.output_directory,
                                          self.GENOME_HYPOTHETICAL)
        mkdir(output_directory_path)

        # One sed command per genome: prefixes every FASTA header with
        # "<genome name>~" and prints the result to stdout.
        rename_script = "".join(
            f"sed \"s/>/>{genome.name}~/g\" {genome.path}\n"
            for genome in genomes_list)

        # Both the script file and the MMSeqs2 scratch directory are removed
        # when the block exits, even if a command raises.
        with tempfile.NamedTemporaryFile() as temp, \
                tempfile.TemporaryDirectory() as tmp_dir:

            # The script must actually be written (as bytes - the file is
            # opened in binary mode) and flushed before bash reads it by name;
            # otherwise createdb receives no sequences.
            temp.write(rename_script.encode())
            temp.flush()

            db_path = path.join(output_directory_path, "db")
            clu_path = path.join(output_directory_path, "clu")
            align_path = path.join(output_directory_path, "alignDb")
            blast_output_path = path.join(output_directory_path, "alignDb.m8")
            formatted_blast_output_path = path.join(output_directory_path,
                                                    "alignDb.formatted.m8")

            clu_tsv_path = path.join(output_directory_path,
                                     "hypothetical_clusters.tsv")

            logging.info('    - Generating MMSeqs2 database')
            cmd = "bash %s | sponge | mmseqs createdb /dev/stdin %s -v 0 > /dev/null 2>&1" % (
                temp.name, db_path)
            run_command(cmd)

            logging.info('    - Clustering genome proteins')
            cmd = f"mmseqs cluster \
                        {db_path} \
                        {clu_path} \
                        {tmp_dir} \
                        --max-seqs 1000 \
                        --threads {self.threads} \
                        --min-seq-id {self.percent_id_cutoff} \
                        -e {self.evalue} \
                        -c {self.fraction_aligned} \
                        -v 0 "

            run_command(cmd)

            logging.info('    - Extracting clusters')
            cmd = 'mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1' % (
                db_path, db_path, clu_path, clu_tsv_path)
            run_command(cmd)

            logging.info(
                '    - Computing Smith-Waterman alignments for clustering results'
            )
            cmd = "mmseqs alignall %s %s %s --alignment-mode 3 -v 0  " % (
                db_path, clu_path, align_path)
            run_command(cmd)

            logging.info('    - Converting to BLAST-like output')
            cmd = "mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1   " % (
                db_path, db_path, align_path, blast_output_path)
            # --format-output query,target,bits
            run_command(cmd)

            logging.info('    - Reformatting BLAST output')
            # The first "%s" argument re-inserts a literal %s, which awk's
            # printf needs as its own format specifier.
            cmd = "OFS=\"\t\" awk 'FNR==NR{a[$1]=$2;next}{$3=a[$3]; \
                                            $1=\"\"; for(i=2;i<NF;i++){printf(\"%s\t\",$i)} \
                                            printf(\"\\n\")}' %s %s | cut -f1,2,5 > %s" \
                % ("%s", db_path + '.lookup', blast_output_path, formatted_blast_output_path)
            run_command(cmd)

        ortholog_dict = self.run_mcl(formatted_blast_output_path,
                                     output_directory_path)
        ortholog_ids = ortholog_dict.keys()
        cluster_ids = self.parse_cluster_results(clu_tsv_path, genomes_list,
                                                 ortholog_dict,
                                                 output_directory_path)
        return cluster_ids, ortholog_ids
Exemplo n.º 10
0
    def annotate_hypothetical(self, genomes_list):
        '''
        Sort proteins coded by each genome into homologous clusters.

        Inputs
        ------
        genomes_list - list. list of Genome objects (their .name and .path
                       attributes are used here)

        Outputs
        -------
        returns (cluster_ids, ortholog_ids). cluster_ids is the result of
        self.parse_cluster_results. ortholog_ids holds the keys of the
        dictionary returned by self.run_mcl when self.annotate_ortholog is
        set, and is an empty list otherwise.
        '''
        output_directory_path = path.join(self.output_directory,
                                          self.GENOME_HYPOTHETICAL)
        mkdir(output_directory_path)

        db_path = path.join(output_directory_path, "db")
        clu_path = path.join(output_directory_path, "clu")
        align_path = path.join(output_directory_path, "alignDb")
        blast_output_path = path.join(output_directory_path, "alignDb.m8")
        formatted_blast_output_path = path.join(output_directory_path,
                                                "alignDb.formatted.m8")

        clu_tsv_path = path.join(output_directory_path,
                                 "hypothetical_clusters.tsv")

        # All scratch files live in one private temporary directory that is
        # removed when the block exits, even if a command raises. This replaces
        # randomly named files in the working directory (previously obtained
        # through a private tempfile API) and the never-deleted mkdtemp() dir.
        with tempfile.TemporaryDirectory() as scratch_dir:
            renamed_genomes = list()
            for index, genome in enumerate(genomes_list):
                renamed_genome = path.join(scratch_dir, f"genome_{index}.faa")
                # Prefix every FASTA header with "<genome name>~".
                cmd = f"sed 's/>/>{genome.name}~/g' {genome.path} > {renamed_genome}"
                run_command(cmd)
                renamed_genomes.append(renamed_genome)

            # Working directory handed to "mmseqs cluster".
            tmp_dir = tempfile.mkdtemp(dir=scratch_dir)

            logging.info('    - Generating MMSeqs2 database')
            cmd = f"mmseqs createdb {' '.join(renamed_genomes)} {db_path}"
            run_command(cmd)

            logging.info('    - Clustering genome proteins')
            cmd = f"mmseqs cluster \
                    {db_path} \
                    {clu_path} \
                    {tmp_dir} \
                    --threads {self.threads} \
                    --min-seq-id {self.percent_id_cutoff} \
                    -c {self.fraction_aligned} \
                    -v 0"

            run_command(cmd)

        logging.info('    - Extracting clusters')
        cmd = f'mmseqs createtsv \
                    {db_path} \
                    {db_path} \
                    {clu_path} \
                    {clu_tsv_path} \
                    --threads {self.threads} \
                    -v 0'

        run_command(cmd)

        if self.annotate_ortholog:

            logging.info(
                '    - Computing Smith-Waterman alignments for clustering results'
            )
            cmd = f"mmseqs alignall \
                        {db_path} \
                        {clu_path} \
                        {align_path} \
                        --alignment-mode 3 \
                        --threads {self.threads} \
                        -v 0"

            run_command(cmd)

            logging.info('    - Converting to BLAST-like output')
            cmd = f"mmseqs createtsv \
                        {db_path} \
                        {db_path} \
                        {align_path} \
                        {blast_output_path} \
                        --threads {self.threads} \
                        -v 0"

            # --format-output query,target,bits
            run_command(cmd)

            logging.info('    - Reformatting BLAST output')
            # The first "%s" argument re-inserts a literal %s, which awk's
            # printf needs as its own format specifier.
            cmd = "OFS=\"\t\" awk 'FNR==NR{a[$1]=$2;next}{$3=a[$3]; \
                                            $1=\"\"; for(i=2;i<NF;i++){printf(\"%s\t\",$i)} \
                                            printf(\"\\n\")}' %s %s | cut -f1,2,5 > %s" \
                % ("%s", db_path + '.lookup', blast_output_path, formatted_blast_output_path)
            run_command(cmd)

            ortholog_dict = self.run_mcl(formatted_blast_output_path,
                                         output_directory_path)
            ortholog_ids = ortholog_dict.keys()
        else:
            # Ortholog detection is disabled: hand empty containers downstream.
            ortholog_dict = dict()
            ortholog_ids = list()
        cluster_ids = self.parse_cluster_results(clu_tsv_path, genomes_list,
                                                 ortholog_dict,
                                                 output_directory_path)
        return cluster_ids, ortholog_ids