def populate_gene_cluster_homogeneity_index(self, gene_clusters_dict):
    """Compute homogeneity indices for gene clusters and add them as item additional data.

    Nothing is computed or stored (and a warning is issued instead) when
    `self.skip_alignments` or `self.skip_homogeneity` is set, or when the
    computation comes back with no results.

    Args:
        gene_clusters_dict: Dictionary keyed by gene cluster name. Only its
            keys are used here.

    Returns:
        None
    """
    if self.skip_alignments:
        self.run.warning('Skipping homogeneity calculations because gene clusters are not aligned.')
        return

    if self.skip_homogeneity:
        self.run.warning("Skipping homogeneity calculations per the '--skip-homogeneity' flag.")
        return

    pan = dbops.PanSuperclass(args=self.args, r=self.run, p=self.progress)
    gene_cluster_names = set(gene_clusters_dict)

    d = pan.compute_homogeneity_indices_for_gene_clusters(gene_cluster_names=gene_cluster_names,
                                                          num_threads=self.num_threads)

    # Both `None` and an empty dictionary mean there is nothing to store. The
    # warning below talks about an empty dictionary, so we treat it that way
    # rather than handing an empty payload to the additional data table.
    if not d:
        self.run.warning("Anvi'o received an empty dictionary for homogeneity indices. Not good :/ "
                         "Returning empty handed, without updating anything in the pan database...")
        return

    homogeneity_keys = ['functional_homogeneity_index',
                        'geometric_homogeneity_index',
                        'combined_homogeneity_index']

    miscdata.TableForItemAdditionalData(self.args).add(d, homogeneity_keys, skip_check_names=True)
def process_gene_clusters(self, gene_clusters_dict):
    """Build view data for gene clusters, filter them by occurrence, and store the views.

    For every gene cluster this fills `self.view_data` (number of genes per
    genome), `self.view_data_presence_absence` (1/0 per genome) and
    `self.additional_view_data` (per-cluster summary values), and updates the
    `singleton_gene_clusters` and `num_gene_clusters_raw` counters kept for
    each genome in `self.genomes`.

    Gene clusters that occur in fewer than `self.gene_cluster_min_occurrence`
    genomes are then removed from all of the above AND from
    `gene_clusters_dict` itself (the input dictionary is modified in place).

    If the number of remaining gene clusters exceeds
    `self.max_num_gene_clusters_for_hierarchical_clustering`, and
    `self.enforce_hierarchical_clustering` is not set,
    `self.skip_hierarchical_clustering` is set to True.

    Finally, the two views and the additional data are written out via
    `TablesForViews` and `miscdata.TableForItemAdditionalData`.

    Args:
        gene_clusters_dict: Dictionary mapping each gene cluster name to an
            iterable of gene entries, where each entry is a mapping that has
            a 'genome_name' key matching a key of `self.genomes`.

    Returns:
        The same `gene_clusters_dict` object, minus the gene clusters removed
        by the occurrence filter.
    """
    self.progress.new('Generating view data')
    self.progress.update('...')

    # snapshot of the names, so we can safely remove items from the
    # dictionaries while looping further down
    gene_clusters = list(gene_clusters_dict.keys())

    for genome_name in self.genomes:
        self.genomes[genome_name]['singleton_gene_clusters'] = 0
        self.genomes[genome_name]['num_gene_clusters_raw'] = 0

    for gene_cluster in gene_clusters:
        # number of genes each genome contributes to this gene cluster
        frequencies = {genome_name: 0 for genome_name in self.genomes}

        for gene_entry in gene_clusters_dict[gene_cluster]:
            genome_name = gene_entry['genome_name']

            frequencies[genome_name] += 1
            self.genomes[genome_name]['num_gene_clusters_raw'] += 1

        genomes_contributing_to_gene_cluster = [genome_name for genome_name, count in frequencies.items() if count]

        # a gene cluster with hits from a single genome counts as a singleton for that genome
        if len(genomes_contributing_to_gene_cluster) == 1:
            self.genomes[genomes_contributing_to_gene_cluster[0]]['singleton_gene_clusters'] += 1

        self.view_data[gene_cluster] = frequencies
        self.view_data_presence_absence[gene_cluster] = {genome_name: 1 if count else 0
                                                         for genome_name, count in frequencies.items()}
        self.additional_view_data[gene_cluster] = {
            'num_genes_in_gene_cluster': sum(frequencies.values()),
            'num_genomes_gene_cluster_has_hits': len(genomes_contributing_to_gene_cluster),
            # 1 only when every genome contributes exactly one gene
            'SCG': 1 if set(frequencies.values()) == {1} else 0,
            'max_num_paralogs': max(frequencies.values()),
        }

    self.progress.end()

    ########################################################################################
    #                           FILTERING BASED ON OCCURRENCE
    ########################################################################################
    removed_gene_clusters = 0
    for gene_cluster in gene_clusters:
        if self.additional_view_data[gene_cluster]['num_genomes_gene_cluster_has_hits'] >= self.gene_cluster_min_occurrence:
            continue

        self.view_data.pop(gene_cluster)
        self.view_data_presence_absence.pop(gene_cluster)
        self.additional_view_data.pop(gene_cluster)
        gene_clusters_dict.pop(gene_cluster)
        removed_gene_clusters += 1

    if self.gene_cluster_min_occurrence > 1:
        self.run.info('gene_clusters min occurrence', '%d (the filter removed %d gene_clusters)' % (self.gene_cluster_min_occurrence, removed_gene_clusters))

    ########################################################################################
    #            CAN WE CLUSTER THIS STUFF? DOES THE USER WANT US TO TRY REGARDLESS?
    ########################################################################################
    # NOTE: the messages below are built with implicit string concatenation
    # rather than backslash continuations inside the literal, so that source
    # indentation does not leak into the text shown to the user.
    if len(gene_clusters_dict) > self.max_num_gene_clusters_for_hierarchical_clustering:
        if self.enforce_hierarchical_clustering:
            message = ("You have %s gene_clusters, which exceeds the number of gene_clusters anvi'o is comfortable to cluster. But "
                       "since you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt "
                       "to create a hierarchical clustering of your gene_clusters anyway. It may take a bit of "
                       "time. Pour yourself a coffee. Or go to a nice vacation. See you in 10 mins, or next year "
                       "or never.")
            self.run.warning(message % pp(len(gene_clusters_dict)))
        else:
            message = ("It seems you have %s gene clusters in your pangenome. This exceeds the soft limit "
                       "of %s for anvi'o to attempt to create a hierarchical clustering of your gene clusters "
                       "(which becomes the center tree in all anvi'o displays). If you want a hierarchical "
                       "clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.")
            self.run.warning(message % (pp(len(gene_clusters_dict)), pp(self.max_num_gene_clusters_for_hierarchical_clustering)))
            self.skip_hierarchical_clustering = True

    ########################################################################################
    #                           STORING FILTERED DATA IN THE DB
    ########################################################################################
    table_structure = ['gene_cluster'] + sorted(self.genomes.keys())
    table_types = ['text'] + ['numeric'] * len(self.genomes)

    TablesForViews(self.pan_db_path).create_new_view(
                                    data_dict=self.view_data,
                                    table_name='gene_cluster_frequencies',
                                    table_structure=table_structure,
                                    table_types=table_types,
                                    view_name='gene_cluster_frequencies')

    TablesForViews(self.pan_db_path).create_new_view(
                                    data_dict=self.view_data_presence_absence,
                                    table_name='gene_cluster_presence_absence',
                                    table_structure=table_structure,
                                    table_types=table_types,
                                    view_name='gene_cluster_presence_absence')

    item_additional_data_table = miscdata.TableForItemAdditionalData(self.args)
    item_additional_data_keys = ['num_genomes_gene_cluster_has_hits', 'num_genes_in_gene_cluster', 'max_num_paralogs', 'SCG']
    item_additional_data_table.add(self.additional_view_data, item_additional_data_keys, skip_check_names=True)
    #                                                                                    ^^^^^^^^^^^^^^^^^^^^^
    #                                                                                   /
    # here we say skip_check_names=True, simply because the gene_clusters table has not been
    # generated yet, but the check names functionality in dbops looks for the gene clusters table to
    # be certain. it is not a big deal here, since we absolutely know what gene cluster names we are
    # working with.

    ########################################################################################
    #                   RETURN THE -LIKELY- UPDATED PROTEIN CLUSTERS DICT
    ########################################################################################
    return gene_clusters_dict