def __call__(self, data):
    """Return `data` reduced to its best-scoring features.

    If no scoring method was configured (``self.method`` is None), one is
    picked from the data: ``GainRatio`` for a discrete class when at least
    half of the attributes are discrete, ``ANOVA`` for a discrete class
    otherwise, and ``UnivariateLinearRegression`` when the class is not
    discrete.

    Features are ranked by score (highest first when ``self.decreasing``
    is true), cut to the first ``self.k`` entries and then cut at the
    first feature whose score fails ``self.threshold``.  Both limits are
    tested for truthiness, so ``None`` and ``0`` disable them.

    Returns the result of ``data.transform`` on a domain holding the
    selected features plus the original class variables and metas.
    """
    method = self.method
    if method is None:
        # Select a default method according to the provided data.
        if data.domain.has_discrete_class:
            attributes = data.domain.attributes
            n_attrs = len(attributes)
            n_discrete = sum(a.is_discrete for a in attributes)
            # A domain without attributes would divide by zero; treat it
            # as "no discrete attributes", which selects ANOVA.
            discr_ratio = n_discrete / n_attrs if n_attrs else 0
            method = GainRatio() if discr_ratio >= 0.5 else ANOVA()
        else:
            method = UnivariateLinearRegression()

    features = data.domain.attributes
    try:
        scores = method(data)
    except ValueError:
        # The scorer rejected the data as a whole; delegate to the
        # per-feature fallback defined on this object.
        scores = self.score_only_nice_features(data, method)

    best = sorted(zip(scores, features), key=itemgetter(0),
                  reverse=self.decreasing)
    if self.k:
        best = best[:self.k]
    if self.threshold:
        if self.decreasing:
            def passes(pair):
                return pair[0] >= self.threshold
        else:
            def passes(pair):
                return pair[0] <= self.threshold
        # `best` is sorted, so everything after the first failure fails too.
        best = takewhile(passes, best)

    domain = Orange.data.Domain([f for _, f in best],
                                data.domain.class_vars,
                                data.domain.metas)
    return data.transform(domain)
def test_select_threshold(self):
    """Every feature kept by a threshold selector scores >= threshold."""
    table = Table('wine')
    scorer = ANOVA()
    threshold = 30
    selector = SelectBestFeatures(method=scorer, threshold=threshold)
    selected = selector(table)
    # Re-score each surviving attribute on the original data.
    kept_scores_ok = all(
        scorer(table, attr) >= threshold
        for attr in selected.domain.attributes
    )
    self.assertTrue(kept_scores_ok)
def test_select_threshold(self):
    """Every feature kept by a threshold selector scores >= threshold."""
    scorer = ANOVA()
    threshold = 30
    selector = SelectBestFeatures(method=scorer, threshold=threshold)
    selected = selector(self.heart_disease)
    # Re-score each surviving attribute on the original data.
    kept_scores_ok = all(
        scorer(self.heart_disease, attr) >= threshold
        for attr in selected.domain.attributes
    )
    self.assertTrue(kept_scores_ok)
def test_wrong_class_type(self):
    """Scorers must raise ValueError for a class type they do not support.

    All calls use ``self.housing`` (presumably a data set with a
    non-discrete class -- confirm against the fixture set-up).
    """
    for scorer in [Gini(), InfoGain(), GainRatio()]:
        with self.assertRaises(ValueError):
            scorer(self.housing, 0)

    with self.assertRaises(ValueError):
        Chi2()(self.housing, 0)

    with self.assertRaises(ValueError):
        ANOVA()(self.housing, 2)

    # NOTE(review): the flattened source does not show whether this call
    # sat inside the preceding `with`.  It is placed after it, where it is
    # actually executed and must complete without raising -- confirm.
    UnivariateLinearRegression()(self.housing, 2)
def test_anova(self):
    """ANOVA gives its highest score to the column that drives the class."""
    nrows, ncols = 500, 5
    # Seeded private generator: the data is reproducible from run to run
    # and the global NumPy random state is left untouched.
    rng = np.random.RandomState(0)
    X = rng.rand(nrows, ncols)
    # The class is built from column 1 (weight -3) and column 3 (weight 1),
    # so column 1 is expected to be the most informative.
    y = 4 + (-3 * X[:, 1] + X[:, 3]) // 2
    domain = Domain.from_numpy(X, y)
    domain = Domain(domain.attributes,
                    DiscreteVariable('c', values=np.unique(y)))
    data = Table(domain, X, y)
    scorer = ANOVA()
    scores = [scorer(data, column) for column in range(ncols)]
    # assertEqual reports the offending index on failure.
    self.assertEqual(np.argmax(scores), 1)
def __call__(self, data):
    """Return `data` reduced to its best-scoring features.

    If no scoring method was configured (``self.method`` is None), one is
    picked from the data: ``GainRatio`` for a discrete class when at least
    half of the attributes are discrete, ``ANOVA`` for a discrete class
    otherwise, and ``UnivariateLinearRegression`` when the class is not
    discrete.

    Features are ranked by score (highest first when ``self.decreasing``
    is true), cut to the first ``self.k`` entries and then cut at the
    first feature whose score fails ``self.threshold``.  Both limits are
    tested for truthiness, so ``None`` and ``0`` disable them.

    Returns the result of ``data.from_table`` for a domain holding the
    selected features plus the original class variables and metas.

    Raises:
        ValueError: if the class variable of `data` is not an instance of
            the scoring method's ``class_type``.
    """
    method = self.method
    if method is None:
        # Select a default method according to the provided data.
        if data.domain.has_discrete_class:
            attributes = data.domain.attributes
            n_attrs = len(attributes)
            n_discrete = sum(a.is_discrete for a in attributes)
            # A domain without attributes would divide by zero; treat it
            # as "no discrete attributes", which selects ANOVA.
            discr_ratio = n_discrete / n_attrs if n_attrs else 0
            method = GainRatio() if discr_ratio >= 0.5 else ANOVA()
        else:
            method = UnivariateLinearRegression()

    if not isinstance(data.domain.class_var, method.class_type):
        # `method` may be a scorer class or an instance.  isinstance also
        # recognises classes created by a custom metaclass, so the message
        # names the scorer rather than its metaclass.
        scorer_cls = method if isinstance(method, type) else type(method)
        raise ValueError(
            "Scoring method {} requires a class variable "
            "of type {}.".format(scorer_cls.__name__,
                                 method.class_type.__name__))

    features = data.domain.attributes
    try:
        scores = method(data)
    except ValueError:
        # The scorer rejected the data as a whole; delegate to the
        # per-feature fallback defined on this object.
        scores = self.score_only_nice_features(data, method)

    best = sorted(zip(scores, features), key=itemgetter(0),
                  reverse=self.decreasing)
    if self.k:
        best = best[:self.k]
    if self.threshold:
        if self.decreasing:
            def passes(pair):
                return pair[0] >= self.threshold
        else:
            def passes(pair):
                return pair[0] <= self.threshold
        # `best` is sorted, so everything after the first failure fails too.
        best = takewhile(passes, best)

    domain = Orange.data.Domain([f for _, f in best],
                                data.domain.class_vars,
                                data.domain.metas)
    return data.from_table(domain, data)