def get_coverage_and_detection_dict(self, bin_id):
    """Load coverage and detection information for `bin_id` onto this object.

    A `summarizer.Bin` is built from `self.summary` for the requested bin and
    used to set:

      - `self.coverage_values_per_nt`: the result of
        `get_coverage_values_per_nucleotide` for the bin's per-split,
        per-nucleotide coverage values and `self.samples`.
      - `self.total_length`: the total length reported by the bin summary.

    The bin's gene coverages and gene detection values are then handed to
    `self.init_coverage_and_detection_dataframes`.

    Args:
        bin_id: identifier of the bin to summarize.

    Returns:
        None. Results are stored on `self`.
    """
    bin_summary = summarizer.Bin(self.summary, bin_id)

    per_nt_coverage_by_split = bin_summary.split_coverage_values_per_nt_dict
    self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
        per_nt_coverage_by_split,
        self.samples,
    )

    # total length of all contigs in the bin
    self.total_length = bin_summary.total_length

    self.init_coverage_and_detection_dataframes(
        bin_summary.gene_coverages,
        bin_summary.gene_detection,
    )
def get_gene_presence_in_the_environment_dict(self):
    """Classify each gene of each internal genome by its presence in the environment.

    For every profile database path in
    `self.unique_profile_db_path_to_internal_genome_name`, and every internal
    genome listed under it, the mean coverage of each gene is summed across
    samples. The median of those sums within the genome, multiplied by
    `self.fraction_of_median_coverage`, is the cutoff used to label genes:

      - 'NA'  : the genome's detection does not exceed `self.min_detection`
                in any metagenome, so no call is made for its genes.
      - 'EAG' : the gene's summed coverage is below the cutoff.
      - 'ECG' : otherwise.

    Returns:
        dict: `{bin_id: {gene_callers_id: 'NA' | 'EAG' | 'ECG'}}`, where
        `bin_id` is the 'bin_id' entry of the internal genome's description.

    Raises:
        ConfigError: if `self.fraction_of_median_coverage` or
            `self.min_detection` is not a `float`.
    """
    if not isinstance(self.fraction_of_median_coverage, float):
        raise ConfigError("Fraction of median coverage must be of type `float`.")

    if not isinstance(self.min_detection, float):
        raise ConfigError("Minimum detection must be of type `float`")

    self.run.info('Fraction of median coverage for core genes', self.fraction_of_median_coverage)
    self.run.info('Min detection of a genome in at least one metagenome', self.min_detection)

    gene_presence_in_the_environment_dict = {}

    self.progress.new('Working on gene presence/absence')
    try:
        self.progress.update('...')

        for profile_db_path in self.unique_profile_db_path_to_internal_genome_name:
            self.progress.update('Collecting info from profile db at %s ...' % (profile_db_path))
            summary = self.get_summary_object_for_profile_db(profile_db_path)

            for internal_genome_name in self.unique_profile_db_path_to_internal_genome_name[profile_db_path]:
                genome_name = self.descriptions.genomes[internal_genome_name]['bin_id']

                self.progress.update('Working on genome %s in profile db %s ...' % (internal_genome_name, profile_db_path))

                # for each genome, first see whether it is detected in at least one metagenome
                detection_across_metagenomes = summary.collection_profile[genome_name]['detection']
                not_enough_detection = not any(
                    detection_across_metagenomes[m] > self.min_detection
                    for m in detection_across_metagenomes
                )

                gene_presence_in_the_environment_dict[genome_name] = {}

                split_names_of_interest = self.descriptions.get_split_names_of_interest_for_internal_genome(
                    self.descriptions.genomes[internal_genome_name])

                genome_bin_summary = summarizer.Bin(summary, genome_name, split_names_of_interest)
                gene_coverages_across_samples = utils.get_values_of_gene_level_coverage_stats_as_dict(
                    genome_bin_summary.gene_level_coverage_stats_dict, "mean_coverage")

                # at this point we have all the genes in the genome bin. what we need is to
                # characterize their detection. first, summarize the coverage of each gene
                # across all samples:
                sum_gene_coverages_across_samples = {
                    gene_callers_id: sum(coverages.values())
                    for gene_callers_id, coverages in gene_coverages_across_samples.items()
                }

                # now identify the median coverage, and derive the cutoff below which a gene
                # is considered under-covered relative to the rest of the genome
                median_coverage_across_samples = numpy.median(list(sum_gene_coverages_across_samples.values()))
                coverage_cutoff = median_coverage_across_samples * self.fraction_of_median_coverage

                # decide whether each gene found in this genome is also found in the
                # environment, and store that call in `gene_presence_in_the_environment_dict`
                for gene_caller_id, summed_coverage in sum_gene_coverages_across_samples.items():
                    if not_enough_detection:
                        _class = 'NA'
                    elif summed_coverage < coverage_cutoff:
                        _class = 'EAG'
                    else:
                        _class = 'ECG'

                    gene_presence_in_the_environment_dict[genome_name][gene_caller_id] = _class
    finally:
        # always close the progress display, even when a summary lookup fails midway
        self.progress.end()

    return gene_presence_in_the_environment_dict
def get_coverage_and_detection_dict(self, bin_id):
    """Build gene coverage and detection DataFrames for `bin_id`.

    A `summarizer.Bin` is created from `self.summary`, and its
    `gene_coverages` and `gene_detection` mappings are converted into float
    DataFrames with `orient='index'` (outer keys become the row index).
    Columns listed in `self.samples_to_exclude` are dropped from both tables.

    Sets on `self`:
        gene_coverages: coverage DataFrame after exclusion.
        Ng: number of rows in `gene_coverages`
            (presumably the number of genes; confirm against caller).
        gene_detections: detection DataFrame after exclusion.
        samples: set of the remaining column labels of `gene_coverages`.

    Args:
        bin_id: identifier of the bin to summarize.

    Returns:
        None. Results are stored on `self`.
    """
    _bin = summarizer.Bin(self.summary, bin_id)

    self.gene_coverages = pd.DataFrame.from_dict(_bin.gene_coverages, orient='index', dtype=float)
    self.gene_coverages.drop(self.samples_to_exclude, axis=1, inplace=True)

    self.Ng = len(self.gene_coverages.index)

    self.gene_detections = pd.DataFrame.from_dict(_bin.gene_detection, orient='index', dtype=float)
    self.gene_detections.drop(self.samples_to_exclude, axis=1, inplace=True)

    # whatever columns survive the exclusion are the samples we work with
    self.samples = set(self.gene_coverages.columns.values)
def get_coverage_and_detection_dict(self, bin_id):
    """Store the gene coverage and detection mappings of `bin_id` on this object.

    A `summarizer.Bin` is created from `self.summary`; its `gene_coverages`
    and `gene_detection` attributes are stored as-is on `self`. The sample
    names are taken from the keys of the first inner mapping in
    `gene_coverages` (assumes every entry shares the same keys; confirm
    against the summarizer).

    Sets on `self`:
        gene_coverages: `_bin.gene_coverages`.
        gene_detection: `_bin.gene_detection`.
        samples: set of keys of the first entry of `gene_coverages`, or an
            empty set when `gene_coverages` has no entries.

    Args:
        bin_id: identifier of the bin to summarize.

    Returns:
        None. Results are stored on `self`.
    """
    _bin = summarizer.Bin(self.summary, bin_id)

    self.gene_coverages = _bin.gene_coverages
    self.gene_detection = _bin.gene_detection

    # default to an empty mapping so a bin without gene coverages yields an
    # empty sample set instead of leaking a bare StopIteration to the caller
    first_entry = next(iter(self.gene_coverages.values()), {})
    self.samples = set(first_entry)