示例#1
0
    def store_gene_clusters(self, gene_clusters_dict):
        """Write every gene cluster entry to the pan database and record totals.

        Each gene entry found under each key of `gene_clusters_dict` is added
        to a `TableForGeneClusters` instance, which is then stored. Afterwards
        the number of gene clusters and the total number of gene entries are
        saved as meta values in the pan database.

        Parameters
        ----------
        gene_clusters_dict : dict
            Maps a gene cluster name to an iterable of gene entries.
        """
        self.progress.new('Storing gene clusters in the database')
        self.progress.update('...')

        gene_clusters_table = TableForGeneClusters(self.pan_db_path,
                                                   run=self.run,
                                                   progress=self.progress)

        # queue up every entry, keeping a running total for the meta table
        total_gene_entries = 0
        for gene_entries in gene_clusters_dict.values():
            for entry in gene_entries:
                gene_clusters_table.add(entry)
                total_gene_entries += 1

        self.progress.end()

        gene_clusters_table.store()

        total_gene_clusters = len(gene_clusters_dict)

        pan_db = dbops.PanDatabase(self.pan_db_path, quiet=True)
        pan_db.db.set_meta_value('num_gene_clusters', total_gene_clusters)
        pan_db.db.set_meta_value('num_genes_in_gene_clusters', total_gene_entries)
        pan_db.disconnect()

        self.run.info('gene clusters info',
                      '%d gene_clusters stored in the database' % total_gene_clusters)
示例#2
0
    def store_PCs(self, PCs_dict):
        """Write every protein cluster entry to the pan database and record totals.

        Each gene entry found under each key of `PCs_dict` is added to a
        `dbops.TableForProteinClusters` instance, which is then stored. The
        number of protein clusters and the total number of gene entries are
        afterwards saved as meta values in the pan database.

        Parameters
        ----------
        PCs_dict : dict
            Maps a protein cluster name to an iterable of gene entries.
        """
        self.progress.new('Storing protein clusters in the database')
        self.progress.update('...')

        pc_table = dbops.TableForProteinClusters(self.pan_db_path,
                                                 run=self.run,
                                                 progress=self.progress)

        # queue up every entry, keeping a running total for the meta table
        gene_total = 0
        for entries in PCs_dict.values():
            for entry in entries:
                pc_table.add(entry)
                gene_total += 1

        self.progress.end()

        pc_table.store()

        pc_total = len(PCs_dict)

        pan_db = dbops.PanDatabase(self.pan_db_path, quiet=True)
        pan_db.db.set_meta_value('num_PCs', pc_total)
        pan_db.db.set_meta_value('num_genes_in_PCs', gene_total)
        pan_db.disconnect()

        self.run.info('protein clusters info',
                      '%d PCs stored in the database' % pc_total)
示例#3
0
    def generate_pan_db(self):
        """Collect the run settings as meta values and hand them to the pan database.

        The meta values are read from attributes of this instance and passed
        to `create()` on a `dbops.PanDatabase` opened at `self.pan_db_path`.
        """
        # fall back to a placeholder text when no description was given
        description = self.description or '_No description is provided_'

        meta_values = {
            'internal_genome_names': ','.join(self.internal_genome_names),
            'external_genome_names': ','.join(self.external_genome_names),
            'num_genomes': len(self.genomes),
            'min_percent_identity': self.min_percent_identity,
            'pc_min_occurrence': self.PC_min_occurrence,
            'mcl_inflation': self.mcl_inflation,
            'default_view': 'PC_presence_absence',
            'use_ncbi_blast': self.use_ncbi_blast,
            'diamond_sensitive': self.sensitive,
            'maxbit': self.maxbit,
            'exclude_partial_gene_calls': self.exclude_partial_gene_calls,
            'gene_alignments_computed': not self.skip_alignments,
            'genomes_storage_hash': self.genomes_storage_hash,
            'project_name': self.project_name,
            'PCs_clustered': False,
            'description': description,
        }

        pan_db = dbops.PanDatabase(self.pan_db_path, quiet=False)
        pan_db.create(meta_values)
示例#4
0
    def process_PCs(self, PCs_dict):
        """Build view data for protein clusters, filter by occurrence, and store it.

        For every protein cluster (PC) in `PCs_dict` this fills
        `self.view_data` (number of gene entries per genome),
        `self.view_data_presence_absence` (0/1 per genome) and
        `self.additional_view_data` (per-PC summary numbers). PCs that have
        hits in fewer genomes than `self.PC_min_occurrence` are then removed
        from all three, and from `PCs_dict` itself. The remaining data are
        written to the pan database as views.

        If more PCs remain than `self.max_num_PCs_for_hierarchical_clustering`
        and `self.enforce_hierarchical_clustering` is not set,
        `self.skip_hierarchical_clustering` is switched to True.

        Parameters
        ----------
        PCs_dict : dict
            Maps a PC name to an iterable of gene entries, each of which is
            indexed with 'genome_name'. Modified in place.

        Returns
        -------
        dict
            The very same `PCs_dict` object, without the filtered-out PCs.
        """
        self.progress.new('Generating view data')
        self.progress.update('...')

        # snapshot of the names: PCs_dict itself shrinks during filtering below
        pc_names = list(PCs_dict.keys())

        for pc_name in pc_names:
            gene_counts = {genome_name: 0 for genome_name in self.genomes}
            presence_absence = {genome_name: 0 for genome_name in self.genomes}
            summary = {'num_genes_in_pc': 0, 'num_genomes_pc_has_hits': 0, 'SCG': 0}

            self.view_data[pc_name] = gene_counts
            self.view_data_presence_absence[pc_name] = presence_absence
            self.additional_view_data[pc_name] = summary

            for gene_entry in PCs_dict[pc_name]:
                genome_name = gene_entry['genome_name']

                gene_counts[genome_name] += 1
                presence_absence[genome_name] = 1
                summary['num_genes_in_pc'] += 1

            # the SCG flag is raised only when every genome has a count of exactly 1
            summary['SCG'] = int(set(gene_counts.values()) == {1})
            summary['num_genomes_pc_has_hits'] = sum(1 for count in gene_counts.values() if count > 0)

        self.progress.end()

        ########################################################################################
        #                           FILTERING BASED ON OCCURRENCE
        ########################################################################################
        removed_PCs = 0
        for pc_name in pc_names:
            if self.additional_view_data[pc_name]['num_genomes_pc_has_hits'] >= self.PC_min_occurrence:
                continue

            # too few genomes carry this PC: drop it everywhere
            self.view_data.pop(pc_name)
            self.view_data_presence_absence.pop(pc_name)
            self.additional_view_data.pop(pc_name)
            PCs_dict.pop(pc_name)
            removed_PCs += 1

        if self.PC_min_occurrence > 1:
            self.run.info('PCs min occurrence',
                          '%d (the filter removed %d PCs)' % (self.PC_min_occurrence, removed_PCs))

        ########################################################################################
        #            CAN WE CLUSTER THIS STUFF? DOES THE USER WANT US TO TRY REGARDLESS?
        ########################################################################################
        too_many_PCs = len(PCs_dict) > self.max_num_PCs_for_hierarchical_clustering

        if too_many_PCs and self.enforce_hierarchical_clustering:
            self.run.warning("You have %s PCs, which exceeds the number of PCs anvi'o is comfortable to cluster. But\
                                  since you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                                  to create a hierarchical clustering of your PCs anyway. It may take a bit of \
                                  time. Pour yourself a coffee. Or go to a nice vacation. See you in 10 mins, or next year \
                                  or never." % pp(len(PCs_dict)))
        elif too_many_PCs:
            self.run.warning("It seems you have %s protein clusters in your pangenome. This exceeds the soft limit\
                                  of %s for anvi'o to attempt to create a hierarchical clustering of your protein clusters\
                                  (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                                  clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`." \
                             % (pp(len(PCs_dict)), pp(self.max_num_PCs_for_hierarchical_clustering)))
            self.skip_hierarchical_clustering = True

        ########################################################################################
        #                           STORING FILTERED DATA IN THE DB
        ########################################################################################
        view_structure = ['PC'] + sorted(self.genomes.keys())
        view_types = ['text'] + ['numeric'] * len(self.genomes)

        # both per-genome views share one layout and use the view name as table name
        per_genome_views = (('PC_frequencies', self.view_data),
                            ('PC_presence_absence', self.view_data_presence_absence))

        for view_name, view_dict in per_genome_views:
            dbops.TablesForViews(self.pan_db_path).create_new_view(data_dict=view_dict,
                                                                   table_name=view_name,
                                                                   table_structure=view_structure,
                                                                   table_types=view_types,
                                                                   view_name=view_name)

        additional_data_structure = ['PC', 'num_genomes_pc_has_hits', 'num_genes_in_pc', 'SCG']
        dbops.TablesForViews(self.pan_db_path).create_new_view(data_dict=self.additional_view_data,
                                                               table_name='additional_data',
                                                               table_structure=additional_data_structure,
                                                               table_types=['text', 'numeric', 'numeric', 'numeric'],
                                                               view_name=None)

        # add additional data structure to the self table, so we can have them initially ordered
        # in the interface the way additional_data_structure suggests:
        additional_data_headers = ','.join(additional_data_structure[1:])
        pan_db = dbops.PanDatabase(self.pan_db_path, quiet=True)
        pan_db.db.set_meta_value('additional_data_headers', additional_data_headers)
        pan_db.disconnect()

        ########################################################################################
        #                   RETURN THE -LIKELY- UPDATED PROTEIN CLUSTERS DICT
        ########################################################################################
        return PCs_dict