Пример #1
0
    def test_hypergeom_p_values(self):
        """p-values must be identical for raw counts and their 0/1 clipping."""
        expected = [0.16666666666666669, 0.49999999999999989, 1.0, 0.49999999999999989, 1.0]

        # raw count matrix
        observed = hypergeom_p_values(self.x, self.x[-2:, :])
        np.testing.assert_almost_equal(observed, expected)

        # binarized (0/1) matrix obtained by clipping the counts
        binary = self.x.clip(min=0, max=1)
        observed = hypergeom_p_values(binary, binary[-2:, :])
        np.testing.assert_almost_equal(observed, expected)
Пример #2
0
    def test_hypergeom_p_values(self):
        """p-values agree across dense, sparse and binarized inputs; shape
        mismatches are rejected."""
        expected = [0.16666666666666669, 0.49999999999999989, 1.0, 0.49999999999999989, 1.0]

        # dense counts
        np.testing.assert_almost_equal(
            hypergeom_p_values(self.x, self.x[-2:, :]), expected)

        # sparse counts give the same result
        np.testing.assert_almost_equal(
            hypergeom_p_values(sp.csr_matrix(self.x), self.x[-2:, :]), expected)

        # binarized (0/1) counts give the same result
        binary = self.x.clip(min=0, max=1)
        np.testing.assert_almost_equal(
            hypergeom_p_values(binary, binary[-2:, :]), expected)

        # mismatched number of columns must raise
        with self.assertRaises(ValueError):
            hypergeom_p_values(self.x, self.x[-2:, :-1])
Пример #3
0
    def apply(self):
        """Recompute word enrichment (p-values + FDR) and refresh the display."""
        self.clear()
        self.progressBarInit()
        self.filter_enabled(False)

        # attribute names of the transformed selection are the candidate words
        domain = self.selected_data_transformed.domain
        self.words = [attr.name for attr in domain.attributes]
        self.p_values = hypergeom_p_values(
            self.data.X, self.selected_data_transformed.X, callback=self.progress)
        self.fdr_values = false_discovery_rate(self.p_values)
        self.filter_and_display()
        self.filter_enabled(True)
        self.progressBarFinished()
Пример #4
0
    def test_hypergeom_p_values(self):
        """Dense, sparse and clipped inputs all yield the reference p-values."""
        reference = [
            0.16666666666666669, 0.49999999999999989, 1.0, 0.49999999999999989,
            1.0
        ]

        # dense count matrix
        result = hypergeom_p_values(self.x, self.x[-2:, :])
        np.testing.assert_almost_equal(result, reference)

        # the same matrix in CSR sparse form
        result = hypergeom_p_values(sp.csr_matrix(self.x), self.x[-2:, :])
        np.testing.assert_almost_equal(result, reference)

        # counts clipped to 0/1 indicators
        indicators = self.x.clip(min=0, max=1)
        result = hypergeom_p_values(indicators, indicators[-2:, :])
        np.testing.assert_almost_equal(result, reference)

        # column-count mismatch between the two matrices must raise
        with self.assertRaises(ValueError):
            hypergeom_p_values(self.x, self.x[-2:, :-1])
Пример #5
0
 def run(selected_data_transformed: Table, data: Table, result: Result,
         state: TaskState) -> None:
     """Fill *result* with words, p-values and FDR values, reporting each
     stage through *state*."""
     state.set_status("Listing words")
     attrs = selected_data_transformed.domain.attributes
     result.words = [attr.name for attr in attrs]

     state.set_status("Computing p-values")
     result.p_values = hypergeom_p_values(
         data.X,
         selected_data_transformed.X,
         callback=state.set_progress_value)

     state.set_status("Computing FDR values")
     result.fdr_values = FDR(result.p_values)
Пример #6
0
    def apply(self):
        """Run the enrichment computation and update the widget state."""
        self.clear()
        self.progressBarInit()
        self.filter_enabled(False)

        transformed = self.selected_data_transformed
        self.words = [a.name for a in transformed.domain.attributes]
        self.p_values = hypergeom_p_values(self.data.X, transformed.X,
                                           callback=self.progress)
        self.fdr_values = false_discovery_rate(self.p_values)

        self.filter_and_display()
        self.filter_enabled(True)
        self.progressBarFinished()
Пример #7
0
def _hypergeom_clusters(
    cluster_labels: np.ndarray, keywords: List[List[str]],
    fdr_threshold: float, n_words: int
) -> Tuple[Dict[int, List[str]], np.ndarray, np.ndarray, np.ndarray]:
    """Select cluster-specific keywords via a hypergeometric test.

    Args:
        cluster_labels: per-document cluster label; -1 marks noise and is
            skipped.
        keywords: per-document list of (word, score) pairs; only the words
            are used.
        fdr_threshold: p-value cutoff below which a word counts as specific
            to a cluster.
        n_words: maximum number of words reported per cluster.

    Returns:
        Tuple of (per-cluster list of (word, relative frequency) pairs,
        array of all vocabulary words, per-cluster mean word counts,
        per-cluster p-values).
    """
    # drop the scores; keep only the word strings
    keywords = [[w for w, _ in doc_keywords] for doc_keywords in keywords]

    # group documents' keyword lists by cluster label (noise label -1 excluded)
    clusters_keywords = {}
    for label in sorted(set(cluster_labels) - {-1}):
        indices = set(np.flatnonzero(cluster_labels == label))
        clusters_keywords[label] = [
            k for i, k in enumerate(keywords) if i in indices]

    # documents are already tokenized, so identity tokenizer/preprocessor
    cv = CountVectorizer(tokenizer=lambda w: w, preprocessor=lambda w: w)
    X = cv.fit_transform(list(chain.from_iterable(clusters_keywords.values())))
    all_keywords = np.array(cv.get_feature_names_out())

    index = 0
    selected_clusters_keywords = {}
    all_scores, all_p_values = [], []
    for label, cls_kwds in clusters_keywords.items():
        # words specific to this cluster according to the hypergeometric test
        n_docs = len(cls_kwds)
        p_values = hypergeom_p_values(X, X[index:index + n_docs])
        words = set(all_keywords[np.array(p_values) < fdr_threshold])

        # single pass: keep only significant words, then rank by in-cluster
        # frequency (previously built an intermediate unfiltered list first)
        sel_words = [w for w in chain.from_iterable(cls_kwds) if w in words]
        selected_clusters_keywords[label] = [
            (w, c / n_docs) for w, c in Counter(sel_words).most_common(n_words)]

        # mean per-word count over the cluster's documents
        all_scores.append(X[index:index + n_docs].sum(axis=0) / n_docs)
        all_p_values.append(p_values)

        index += n_docs

    all_scores = np.vstack(all_scores)
    all_p_values = np.vstack(all_p_values)
    return selected_clusters_keywords, all_keywords, all_scores, all_p_values