def get_enriched_pathways(
            self,
            genes,
            reference=None,
            prob=statistics.Binomial(),
            callback=None):
        """
        Compute pathway enrichment of `genes` against `reference`.

        Return a dictionary keyed by pathway id whose values are
        ``(list_of_genes, p_value, num_of_reference_genes)`` tuples.

        :param genes: Sized collection of query genes; it is iterated
            more than once and its length is used as the cluster size.
        :param reference: Iterable of reference genes. When None, the
            keys of ``self.genes`` are used. It is converted to a set.
        :param prob: Object exposing ``p_value(mapped, reference_size,
            mapped_reference, cluster_size)``.
        :param callback: Optional callable reporting progress; it receives
            values in [0, 50) while pathways are looked up per gene and
            in [50, 100) while genes are grouped by pathway.
        """
        if reference is None:
            reference = self.genes.keys()
        reference = set(reference)

        n_genes = len(genes)
        milestones = progress_bar_milestones(n_genes, 100)
        pathways_db = KEGGPathways()

        # Phase 1: collect the pathway ids of every query gene.
        per_gene_pathways = []
        for index, gene in enumerate(genes):
            per_gene_pathways.append(self.pathways([gene]))
            if callback and index in milestones:
                callback(index * 50.0 / n_genes)

        # Fetch all needed pathway entries up front (pre-cache for speed).
        pathways_db.pre_cache(
            [pid
             for gene_pathways in per_gene_pathways
             for pid in gene_pathways])

        # Phase 2: group query genes by pathway, keeping only pathways
        # whose database entry lists any genes.
        genes_by_pathway = defaultdict(list)
        for index, (gene, gene_pathways) in enumerate(
                zip(genes, per_gene_pathways)):
            for pid in gene_pathways:
                if pathways_db.get_entry(pid).gene:
                    genes_by_pathway[pid].append(gene)
            if callback and index in milestones:
                callback(50.0 + index * 50.0 / n_genes)

        # Phase 3: count reference genes per pathway and score it.
        enriched = {}
        for pid, mapped in genes_by_pathway.items():
            entry = pathways_db.get_entry(pid)
            n_reference = len(reference.intersection(entry.gene or []))
            p_value = prob.p_value(
                len(mapped), len(reference), n_reference, n_genes)
            enriched[pid] = (mapped, p_value, n_reference)
        return enriched
    def get_enriched_terms(
            self,
            genes,
            reference=None,
            evidence_codes=None,
            slims_only=False,
            aspect=None,
            prob=statistics.Binomial(),
            use_fdr=True,
            progress_callback=None,
    ):
        """
        Return a dictionary of enriched terms, with tuples of
        (list_of_genes, p_value, reference_count) for items and term
        ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

        :param genes: List of genes
        :param reference: Iterable of genes (if None all genes included in
                          the annotations will be used). Any iterable is
                          accepted; it is converted to a set, so duplicate
                          entries count once towards the reference size.
        :param evidence_codes:  List of evidence codes to consider
                                (all keys of `evidence_dict` if empty/None).
        :param slims_only: If `True` return only slim terms.
        :param aspect: Which aspects to use. Use all by default;
                       one of Process (biological process),
                       Function (molecular function) or Component (cellular component).
                       A single string or a collection of aspect names.
        :param prob: Object exposing ``p_value(mapped, reference_size,
                     mapped_reference, cluster_size)``.
        :param use_fdr: If `True` replace the p-values with the values
                        returned by ``statistics.FDR`` for the sorted p-values.
        :param progress_callback: Optional callable receiving the progress
                                  in percent (0-100) while terms are scored.
        """

        all_genes = set(genes)

        if aspect is None:
            aspects_set = {'Process', 'Component', 'Function'}
        elif isinstance(aspect, str):
            aspects_set = {aspect}
        else:
            aspects_set = aspect

        if reference is None:
            reference = self.genes()
        # The set operations below require a set; a plain list (as the
        # documentation allows) used to fail on `reference.intersection`.
        reference = set(reference)

        evidence_codes = set(evidence_codes or evidence_dict.keys())

        # Distinct GO ids annotated to the query genes, in first-seen order.
        terms = list(dict.fromkeys(
            ann.go_id
            for gene in genes for ann in self.gene_annotations[gene]
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        ))

        ref_annotations = {
            ann
            for gene in reference for ann in self.gene_annotations[gene]
            if ann.evidence in evidence_codes and ann.aspect in aspects_set
        }

        self._ensure_ontology()

        if slims_only and not self.ontology.slims_subset:
            warnings.warn(
                "Unspecified slims subset in the ontology! "
                "Using 'goslim_generic' subset", UserWarning)
            self.ontology.set_slims_subset('goslim_generic')

        filtered_terms = [term for term in terms if term in self.ontology]

        if len(terms) != len(filtered_terms):
            term_diff = set(terms) - set(filtered_terms)
            warnings.warn(
                "%s terms in the annotations were not found in the "
                "ontology." % ",".join(map(repr, term_diff)),
                UserWarning,
            )

        # Score the annotated terms together with their ancestors.
        terms = self.ontology.extract_super_graph(filtered_terms)
        res = {}

        n_terms = len(terms)
        milestones = progress_bar_milestones(n_terms, 100)

        for i, term in enumerate(terms):
            if slims_only and term not in self.ontology.slims_subset:
                continue
            all_annotations = self.get_annotations_by_go_id(term).intersection(
                ref_annotations)
            all_annotated_genes = {ann.gene_id for ann in all_annotations}
            mapped_genes = all_genes.intersection(all_annotated_genes)
            # Both operands are sets, so the result does not depend on
            # which side the intersection is called on.
            mapped_reference_genes = reference.intersection(
                all_annotated_genes)

            res[term] = (
                list(mapped_genes),
                prob.p_value(len(mapped_genes), len(reference),
                             len(mapped_reference_genes), len(genes)),
                len(mapped_reference_genes),
            )

            if progress_callback and i in milestones:
                progress_callback(100.0 * i / n_terms)

        if use_fdr:
            # FDR is computed on p-values sorted in ascending order.
            ranked = sorted(res.items(), key=lambda item: item[1][1])
            adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
            res = {
                term_id: (term_genes, fdr, ref_count)
                for (term_id, (term_genes, _, ref_count)), fdr in zip(
                    ranked, adjusted)
            }
        return res
    def __init__(self, parent=None):
        """
        Set up widget state and build the GUI.

        Creates the control-area tabs ("Input", "Filter", "Select"), the
        main-area splitter holding the GO term tree and the table of
        significant terms, and collects the list of available annotation
        files into ``self.available_annotations``.

        :param parent: Optional parent object forwarded to the base class.
        """
        # `super()` already binds `self`; passing it again would shift
        # `parent` into the wrong positional slot.
        super().__init__(parent)

        self.input_data = None
        self.ref_data = None
        self.ontology = None
        self.annotations = None
        self.loaded_annotation_code = None
        self.treeStructRootKey = None
        # Order matches the "Significance test" radio buttons below.
        self.probFunctions = [statistics.Binomial(), statistics.Hypergeometric()]
        self.selectedTerms = []

        self.selectionChanging = 0
        self.__state = State.Ready
        # Single-shot timer used to schedule `__update`.
        self.__scheduletimer = QTimer(self, singleShot=True)
        self.__scheduletimer.timeout.connect(self.__update)

        #############
        # GUI
        #############
        self.tabs = gui.tabWidget(self.controlArea)
        # Input tab
        self.inputTab = gui.createTabPage(self.tabs, "Input")
        box = gui.widgetBox(self.inputTab, "Info")
        self.infoLabel = gui.widgetLabel(box, "No data on input\n")

        gui.button(box, self, "Ontology/Annotation Info",
                   callback=self.ShowInfo,
                   tooltip="Show information on loaded ontology and annotations")

        self.referenceRadioBox = gui.radioButtonsInBox(
            self.inputTab, self, "useReferenceDataset",
            ["Entire genome", "Reference set (input)"],
            tooltips=["Use entire genome for reference",
                      "Use genes from Reference Examples input signal as reference"],
            box="Reference", callback=self.__invalidate)

        # The "Reference set (input)" option starts out disabled.
        self.referenceRadioBox.buttons[1].setDisabled(True)
        gui.radioButtonsInBox(
            self.inputTab, self, "aspectIndex",
            ["Biological process", "Cellular component", "Molecular function"],
            box="Aspect", callback=self.__invalidate)

        # Filter tab
        self.filterTab = gui.createTabPage(self.tabs, "Filter")
        box = gui.widgetBox(self.filterTab, "Filter GO Term Nodes")
        gui.checkBox(box, self, "filterByNumOfInstances", "Genes",
                     callback=self.FilterAndDisplayGraph,
                     tooltip="Filter by number of input genes mapped to a term")
        ibox = gui.indentedBox(box)
        gui.spin(ibox, self, 'minNumOfInstances', 1, 100,
                 step=1, label='#:', labelWidth=15,
                 callback=self.FilterAndDisplayGraph,
                 callbackOnReturn=True,
                 tooltip="Min. number of input genes mapped to a term")

        gui.checkBox(box, self, "filterByPValue_nofdr", "p-value",
                     callback=self.FilterAndDisplayGraph,
                     tooltip="Filter by term p-value")

        gui.doubleSpin(gui.indentedBox(box), self, 'maxPValue_nofdr', 1e-8, 1,
                       step=1e-8,  label='p:', labelWidth=15,
                       callback=self.FilterAndDisplayGraph,
                       callbackOnReturn=True,
                       tooltip="Max term p-value")

        # use filterByPValue for FDR, as it was the default in prior versions
        gui.checkBox(box, self, "filterByPValue", "FDR",
                     callback=self.FilterAndDisplayGraph,
                     tooltip="Filter by term FDR")
        gui.doubleSpin(gui.indentedBox(box), self, 'maxPValue', 1e-8, 1,
                       step=1e-8,  label='p:', labelWidth=15,
                       callback=self.FilterAndDisplayGraph,
                       callbackOnReturn=True,
                       tooltip="Max term p-value")

        box = gui.widgetBox(box, "Significance test")

        gui.radioButtonsInBox(box, self, "probFunc", ["Binomial", "Hypergeometric"],
                              tooltips=["Use binomial distribution test",
                                        "Use hypergeometric distribution test"],
                              callback=self.__invalidate)  # TODO: only update the p values
        box = gui.widgetBox(self.filterTab, "Evidence codes in annotation",
                              addSpace=True)
        # One check box per evidence type, keyed by the evidence code.
        self.evidenceCheckBoxDict = {}
        for etype in go.evidenceTypesOrdered:
            ecb = QCheckBox(
                etype, toolTip=go.evidenceTypes[etype],
                checked=self.useEvidenceType[etype])
            ecb.toggled.connect(self.__on_evidenceChanged)
            box.layout().addWidget(ecb)
            self.evidenceCheckBoxDict[etype] = ecb

        # Select tab
        self.selectTab = gui.createTabPage(self.tabs, "Select")
        gui.radioButtonsInBox(
            self.selectTab, self, "selectionDirectAnnotation",
            ["Directly or Indirectly", "Directly"],
            box="Annotated genes",
            callback=self.ExampleSelection)

        box = gui.widgetBox(self.selectTab, "Output", addSpace=True)
        gui.radioButtonsInBox(
            box, self, "selectionDisjoint",
            btnLabels=["All selected genes",
                       "Term-specific genes",
                       "Common term genes"],
            tooltips=["Outputs genes annotated to all selected GO terms",
                      "Outputs genes that appear in only one of selected GO terms",
                      "Outputs genes common to all selected GO terms"],
            callback=self.ExampleSelection)

        # ListView for DAG, and table for significant GOIDs
        self.DAGcolumns = ['GO term', 'Cluster', 'Reference', 'p-value',
                           'FDR', 'Genes', 'Enrichment']
        p_value_column = self.DAGcolumns.index('p-value')
        enrichment_column = self.DAGcolumns.index('Enrichment')

        self.splitter = QSplitter(Qt.Vertical, self.mainArea)
        self.mainArea.layout().addWidget(self.splitter)

        # list view
        self.listView = GOTreeWidget(self.splitter)
        self.listView.setSelectionMode(QTreeView.ExtendedSelection)
        self.listView.setAllColumnsShowFocus(1)
        self.listView.setColumnCount(len(self.DAGcolumns))
        self.listView.setHeaderLabels(self.DAGcolumns)

        self.listView.header().setSectionsClickable(True)
        self.listView.header().setSortIndicatorShown(True)
        self.listView.header().setSortIndicator(p_value_column, Qt.AscendingOrder)
        self.listView.setSortingEnabled(True)
        self.listView.setItemDelegateForColumn(
            enrichment_column, EnrichmentColumnItemDelegate(self))
        self.listView.setRootIsDecorated(True)

        self.listView.itemSelectionChanged.connect(self.ViewSelectionChanged)

        # table of significant GO terms
        self.sigTerms = QTreeWidget(self.splitter)
        self.sigTerms.setColumnCount(len(self.DAGcolumns))
        self.sigTerms.setHeaderLabels(self.DAGcolumns)
        self.sigTerms.setSortingEnabled(True)
        self.sigTerms.setSelectionMode(QTreeView.ExtendedSelection)
        self.sigTerms.header().setSortIndicator(p_value_column, Qt.AscendingOrder)
        self.sigTerms.setItemDelegateForColumn(
            enrichment_column, EnrichmentColumnItemDelegate(self))

        self.sigTerms.itemSelectionChanged.connect(self.TableSelectionChanged)

        self.sigTableTermsSorted = []
        self.graph = {}
        self.originalGraph = None

        self.inputTab.layout().addStretch(1)
        self.filterTab.layout().addStretch(1)
        self.selectTab.layout().addStretch(1)

        class AnnotationSlot(SimpleNamespace):
            taxid = ...  # type: str
            name = ...   # type: str
            filename = ...  # type:str

            @staticmethod
            def parse_tax_id(f_name):
                # The tax id is the second dot-separated part of the name.
                return f_name.split('.')[1]

        try:
            remote_files = serverfiles.ServerFiles().listfiles(DOMAIN)
        except (ConnectTimeout, RequestException, ConnectionError):
            # TODO: Warn user about failed connection to the remote server
            remote_files = []

        # Merge the server listing with `serverfiles.listfiles(DOMAIN)`
        # (presumably the locally available files -- verify), dropping
        # duplicates and the ontology file itself.
        available_annotations = []
        for _, annotation_file in set(remote_files + serverfiles.listfiles(DOMAIN)):
            if annotation_file == FILENAME_ONTOLOGY:
                continue
            tax_id = AnnotationSlot.parse_tax_id(annotation_file)
            available_annotations.append(
                AnnotationSlot(
                    taxid=tax_id,
                    name=taxonomy.common_taxid_to_name(tax_id),
                    filename=FILENAME_ANNOTATION.format(tax_id),
                )
            )
        self.available_annotations = available_annotations
        self._executor = ThreadExecutor()
    def assign_annotations(items_sets, available_annotations, data,
                           p_value_fun=PFUN_BINOMIAL,
                           scoring=SCORING_EXP_RATIO):
        """
        Score every annotation (e.g. cell type) for every item (e.g. cell).

        For each item the overlap between its attribute set and the
        attribute set of each annotation is tested for significance, the
        p-values are FDR corrected, and a score is derived according to
        `scoring`.

        Parameters
        ----------
        items_sets : list of sets
            Set of most important attributes (e.g. genes) for each item.
        available_annotations : Orange.data.Table
            Available annotations (e.g. cell types).
        data : Orange.data.Table
            Tabular data with gene expressions. Its ``attributes`` must
            contain the tax id, and the "Entrez ID" of each column is read
            from the domain attributes.
        p_value_fun : str, optional (default=PFUN_BINOMIAL)
            Selects the p-value function: PFUN_HYPERGEOMETRIC uses
            ``hypergeom.sf``; any other value uses
            ``statistics.Binomial().p_value``.
        scoring : str, optional (default=SCORING_EXP_RATIO)
            Type of scoring.

        Returns
        -------
        Orange.data.Table
            Annotation scores, one row per item and one column per
            annotation.
        Orange.data.Table
            Annotation FDRs with the same layout.
        """
        assert TAX_ID in data.attributes, (
            "The input table needs to have a tax_id attribute")
        tax_id = data.attributes[TAX_ID]

        # select function for p-value
        if p_value_fun == PFUN_HYPERGEOMETRIC:
            def p_fun(x, n, m, k):
                # sf accepts x-1 instead of x
                return hypergeom.sf(x - 1, n, m, k)
        else:
            p_fun = statistics.Binomial().p_value

        # retrieve number of genes for organism
        n_organism_genes = len(GeneInfo(tax_id))

        entrez_ids = [
            attr.attributes.get("Entrez ID") for attr in data.domain.attributes
        ]
        grouped_annotations_items, genes_celltypes = \
            AnnotateSamples._group_marker_attributes(
                available_annotations, entrez_ids)

        def hg_cell(item_attributes):
            """Return (scores, fdrs) over all annotations for one item."""
            # drawn balls - attributes expressed for the item
            drawn = len(item_attributes)
            p_values = []
            scores = []
            for _, marker_attributes in grouped_annotations_items:
                overlap = len(item_attributes & marker_attributes)
                # marked balls - attributes belonging to the annotation
                marked = len(marker_attributes)

                # avoid the heavy computation when the overlap is small
                if overlap > 2:
                    p_values.append(
                        p_fun(overlap, n_organism_genes, marked, drawn))
                else:
                    p_values.append(1)

                if scoring == SCORING_EXP_RATIO:
                    scores.append(overlap / (marked + 1e-16))

            fdrs = statistics.FDR(p_values)
            if scoring == SCORING_LOG_FDR:
                scores = AnnotateSamples._scores_fdr(fdrs)
            elif scoring == SCORING_LOG_PVALUE:
                scores = AnnotateSamples._scores_fdr(p_values)

            return scores, fdrs

        per_item_results = [hg_cell(item_set) for item_set in items_sets]
        probs, fdrs = zip(*per_item_results)

        # This scoring is computed from the expression data directly and
        # replaces the per-item scores collected above.
        if scoring == SCORING_MARKERS_SUM:
            probs = AnnotateSamples._scores_markers_sum(data, genes_celltypes)

        domain = Domain(
            [ContinuousVariable(group[0])
             for group in grouped_annotations_items])
        probs_table = Table(domain, np.array(probs))
        fdrs_table = Table(domain, np.array(fdrs))

        return probs_table, fdrs_table