def pathway_enrichment(genesets, genes, reference, prob=None, callback=None):
    """
    Score each gene set for over-representation among `genes`.

    For every gene set, the genes it shares with `genes` and with
    `reference` are counted and handed to ``prob.p_value``. Gene sets that
    share nothing with `genes` are left out of the result. The collected
    p-values are passed through ``statistics.FDR`` before being returned.

    :param genesets: Sized iterable of objects exposing ``genes`` (with an
        ``intersection`` method) and ``gs_id``.
    :param genes: Query genes.
    :param reference: Reference genes.
    :param prob: Object with a ``p_value(k, N, m, n)`` method. When None,
        ``statistics.Hypergeometric()`` is used.
    :param callback: Optional callable; after each gene set it is called
        with ``100.0 * index / len(genesets)``.
    :return: Dict mapping ``gs_id`` to a tuple
        ``(shared_genes, corrected_p_value, reference_overlap_size)``.
    """
    model = statistics.Hypergeometric() if prob is None else prob

    hits = []
    raw_p_values = []
    for position, geneset in enumerate(genesets):
        in_query = geneset.genes.intersection(genes)
        in_reference = geneset.genes.intersection(reference)
        overlap, population = len(in_query), len(reference)
        marked, drawn = len(in_reference), len(genes)

        # Only gene sets that actually overlap the query are scored.
        if overlap:
            raw_p_values.append(
                model.p_value(overlap, population, marked, drawn))
            hits.append((geneset.gs_id, in_query, marked))

        if callback is not None:
            callback(100.0 * position / len(genesets))

    # FDR correction
    corrected = statistics.FDR(raw_p_values)

    enriched = {}
    for (gs_id, in_query, marked), adjusted in zip(hits, corrected):
        enriched[gs_id] = (in_query, adjusted, marked)
    return enriched
def __on_enrichment_done(self, results):
    # type: (Future[Dict[str, tuple]]) -> None
    """
    Handle completion of the enrichment computation.

    Clears the progress bar, blocking state and status message. If the
    state was flagged ``State.Stale`` in the meantime, the result is
    discarded and ``__invalidate`` is called instead. Otherwise the result
    is read from `results`; an exception raised by ``results.result()`` is
    shown through ``self.error(1, ...)`` and treated as an empty result.

    Every result tuple is extended with a value from ``statistics.FDR``
    computed over the tuples' second element. The extended mapping is
    stored in ``self.terms``, the parent/child structure between the
    enriched terms is rebuilt in ``self.treeStructDict``, and the graph,
    report output and commit are refreshed.

    :param results: Finished future holding a dict of term id -> tuple.
    """
    self.progressBarFinished(processEvents=False)
    self.setBlocking(False)
    self.setStatusMessage("")

    if self.__state & State.Stale:
        self.__state = State.Ready
        self.__invalidate()
        return

    self.__state = State.Ready
    try:
        enriched = results.result()  # type: Dict[str, tuple]
    except Exception as ex:
        enriched = {}
        self.error(1, str(ex))

    if enriched:
        entries = list(enriched.items())
        fdr_vals = statistics.FDR([values[1] for _, values in entries])
        terms = {
            key: values + (fdr,)
            for (key, values), fdr in zip(entries, fdr_vals)
        }
    else:
        terms = {}

    self.terms = terms

    if not self.terms:
        self.warning(0, "No enriched terms found.")
    else:
        self.warning(0)

    self.treeStructDict = {}
    self.treeStructRootKey = None

    # Parents of each enriched term, taken from the ontology relations.
    parents = {
        term_id: {parent for _, parent in self.ontology[term_id].related}
        for term_id in self.terms
    }

    # Invert the parent map in a single pass instead of scanning every id
    # for every term; parents that are not enriched terms are ignored.
    children = {term_id: set() for term_id in self.terms}
    for child_id, parent_ids in parents.items():
        for parent_id in parent_ids:
            if parent_id in children:
                children[parent_id].add(child_id)

    for term_id in self.terms:
        self.treeStructDict[term_id] = TreeNode(
            self.terms[term_id], children[term_id])
        entry = self.ontology[term_id]
        # A non-obsolete term without relations becomes the root key; if
        # several qualify, the last one iterated wins.
        if not entry.related and not getattr(entry, "is_obsolete", False):
            self.treeStructRootKey = term_id

    self.set_graph(terms)
    self._update_enrichment_report_output()
    self.commit()
def hg_cell(item_attributes):
    """
    Score one item against every group in ``grouped_annotations_items``.

    For each group a p-value is obtained from ``p_fun`` (or fixed to 1 when
    the overlap has at most two elements), and all p-values are passed
    through ``statistics.FDR``. What is returned as scores depends on the
    enclosing ``scoring`` setting: overlap ratios for ``SCORING_EXP_RATIO``,
    or ``AnnotateSamples._scores_fdr`` applied to the FDR values
    (``SCORING_LOG_FDR``) or to the raw p-values (``SCORING_LOG_PVALUE``).

    :param item_attributes: Set of attributes of the item.
    :return: Tuple ``(scores, fdrs)``.
    """
    raw_p_values = []
    ratio_scores = []
    for _, group_attributes in grouped_annotations_items:
        overlap = len(item_attributes & group_attributes)
        drawn = len(item_attributes)  # drawn balls - expressed for item
        marked = len(group_attributes)  # marked balls - items for a process

        # avoid the heavy computation when intersect small
        raw_p_values.append(
            p_fun(overlap, N, marked, drawn) if overlap > 2 else 1)

        if scoring == SCORING_EXP_RATIO:
            # the tiny constant keeps the division defined for empty groups
            ratio_scores.append(overlap / (marked + 1e-16))

    fdrs = statistics.FDR(raw_p_values)

    if scoring == SCORING_LOG_FDR:
        return AnnotateSamples._scores_fdr(fdrs), fdrs
    if scoring == SCORING_LOG_PVALUE:
        return AnnotateSamples._scores_fdr(raw_p_values), fdrs
    return ratio_scores, fdrs
def get_enriched_terms(
    self,
    genes,
    reference=None,
    evidence_codes=None,
    slims_only=False,
    aspect=None,
    prob=statistics.Binomial(),
    use_fdr=True,
    progress_callback=None,
):
    """
    Return a dictionary of enriched terms, with tuples of
    (list_of_genes, p_value, reference_count) for items and term
    ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

    A ``UserWarning`` is issued when `slims_only` is set but the ontology
    has no slims subset (``'goslim_generic'`` is then selected), and when
    annotated terms are missing from the ontology.

    :param genes: List of genes
    :param reference: List or set of genes (if None all genes included in
        the annotations will be used, i.e. ``self.genes()``). The
        population size passed to `prob` is ``len(reference)`` as given.
    :param evidence_codes: List of evidence codes to consider. When empty
        or None, all keys of ``evidence_dict`` are used.
    :param slims_only: If `True` return only slim terms.
    :param aspect: Which aspects to use. Use all by default;
        one of Process (biological process),
        Function (molecular function) or Component (cellular component).
        A collection of aspect names is also accepted.
    :param prob: Object providing ``p_value(k, N, m, n)``. It is called
        with the number of mapped query genes, ``len(reference)``, the
        number of mapped reference genes and ``len(genes)``. The default
        instance is created once, when the function is defined.
    :param use_fdr: If True, the p-values are replaced by the output of
        ``statistics.FDR`` and the result is ordered by ascending
        uncorrected p-value.
    :param progress_callback: Optional callable; called with
        ``100.0 * i / len(terms)`` at the term indices returned by
        ``progress_bar_milestones``.
    """
    all_genes = set(genes)

    if aspect is None:
        aspects_set = {'Process', 'Component', 'Function'}
    elif isinstance(aspect, str):
        aspects_set = {aspect}
    else:
        aspects_set = aspect

    if reference is None:
        reference = self.genes()

    # The set operations below need a real set; a list (as the docstring
    # allows) used to fail with AttributeError on ``.intersection``.
    if isinstance(reference, (set, frozenset)):
        reference_set = reference
    else:
        reference_set = set(reference)

    evidence_codes = set(evidence_codes or evidence_dict.keys())

    def is_selected(ann):
        return ann.evidence in evidence_codes and ann.aspect in aspects_set

    # Only the term ids are needed; dict.fromkeys drops duplicates while
    # keeping first-seen order.
    annotated_terms = dict.fromkeys(
        ann.go_id
        for gene in genes
        for ann in self.gene_annotations[gene]
        if is_selected(ann)
    )

    ref_annotations = {
        ann
        for gene in reference_set
        for ann in self.gene_annotations[gene]
        if is_selected(ann)
    }

    self._ensure_ontology()

    if slims_only and not self.ontology.slims_subset:
        warnings.warn(
            "Unspecified slims subset in the ontology! "
            "Using 'goslim_generic' subset",
            UserWarning,
        )
        self.ontology.set_slims_subset('goslim_generic')

    known_terms = []
    missing_terms = []
    for term in annotated_terms:
        if term in self.ontology:
            known_terms.append(term)
        else:
            missing_terms.append(term)

    if missing_terms:
        warnings.warn(
            "%s terms in the annotations were not found in the "
            "ontology." % ",".join(map(repr, missing_terms)),
            UserWarning,
        )

    terms = self.ontology.extract_super_graph(known_terms)
    milestones = progress_bar_milestones(len(terms), 100)

    res = {}
    for i, term in enumerate(terms):
        if slims_only and term not in self.ontology.slims_subset:
            continue

        term_annotations = self.get_annotations_by_go_id(term).intersection(
            ref_annotations)
        annotated_genes = {ann.gene_id for ann in term_annotations}
        mapped_genes = all_genes & annotated_genes
        mapped_reference_genes = annotated_genes & reference_set

        res[term] = (
            list(mapped_genes),
            prob.p_value(
                len(mapped_genes),
                len(reference),
                len(mapped_reference_genes),
                len(genes),
            ),
            len(mapped_reference_genes),
        )

        if progress_callback and i in milestones:
            progress_callback(100.0 * i / len(terms))

    if use_fdr:
        # FDR is applied to the p-values in ascending order.
        ranked = sorted(res.items(), key=lambda item: item[1][1])
        adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
        res = {
            term: (mapped, fdr, ref_count)
            for (term, (mapped, _, ref_count)), fdr in zip(ranked, adjusted)
        }
    return res