def test_similarity_tree(): # Test that rules are well splitted rules = [ ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)), ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)), ("a > 2 and b > 45", (0.5, 0.3, 0)), ("a > 2 and b > 40", (0.5, 0.2, 0)), ("a <= 2 and b <= 45", (1, 1, 0)), ("a > 2 and c <= 3", (1, 1, 0)), ("b > 45", (1, 1, 0)), ] sk = SkopeRules(max_depth_duplication=2) rulesets = sk._find_similar_rulesets(rules) # Assert some couples of rules are in the same bag idx_bags_rules = [] for idx_rule, r in enumerate(rules): idx_bags_for_rule = [] for idx_bag, bag in enumerate(rulesets): if r in bag: idx_bags_for_rule.append(idx_bag) idx_bags_rules.append(idx_bags_for_rule) assert_equal(idx_bags_rules[0], idx_bags_rules[1]) assert_not_equal(idx_bags_rules[0], idx_bags_rules[2]) # Assert the best rules are kept final_rules = sk.deduplicate(rules) assert_in(rules[0], final_rules) assert_in(rules[2], final_rules) assert_not_in(rules[3], final_rules)
def test_boundaries(): # ensure min_samples is inclusive of core point core, _ = dbscan([[0], [1]], eps=2, min_samples=2) assert_in(0, core) # ensure eps is inclusive of circumference core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2) assert_in(0, core) core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2) assert_not_in(0, core)
def test_scorefunc_multilabel(): # Test whether k-best and percentiles works with multilabels with chi2. X = np.array([[10000, 9999, 0], [100, 9999, 0], [1000, 99, 0]]) y = [[1, 1], [0, 1], [1, 0]] Xt = SelectKBest(chi2, k=2).fit_transform(X, y) assert_equal(Xt.shape, (3, 2)) assert_not_in(0, Xt) Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) assert_equal(Xt.shape, (3, 2)) assert_not_in(0, Xt)
def test_tied_pvalues(): # Test whether k-best and percentiles work with tied pvalues from chi2. # chi2 will return the same p-values for the following features, but it # will return different scores. X0 = np.array([[10000, 9999, 9998], [1, 1, 1]]) y = [0, 1] for perm in itertools.permutations((0, 1, 2)): X = X0[:, perm] Xt = SelectKBest(chi2, k=2).fit_transform(X, y) assert_equal(Xt.shape, (2, 2)) assert_not_in(9998, Xt) Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) assert_equal(Xt.shape, (2, 2)) assert_not_in(9998, Xt)
def test_tied_pvalues(): """Test whether k-best and percentiles work with tied pvalues from chi2.""" # chi2 will return the same p-values for the following features, but it # will return different scores. X0 = np.array([[10000, 9999, 9998], [1, 1, 1]]) y = [0, 1] for perm in itertools.permutations((0, 1, 2)): X = X0[:, perm] Xt = SelectKBest(chi2, k=2).fit_transform(X, y) assert_equal(Xt.shape, (2, 2)) assert_not_in(9998, Xt) Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) assert_equal(Xt.shape, (2, 2)) assert_not_in(9998, Xt)
def test_fetch_one_column(): _urllib2_ref = datasets.mldata.urllib2 try: dataname = 'onecol' # create fake data set in cache x = sp.arange(6).reshape(2, 3) datasets.mldata.urllib2 = mock_urllib2({dataname: {'x': x}}) dset = fetch_mldata(dataname, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "data"]: assert_in(n, dset) assert_not_in("target", dset) assert_equal(dset.data.shape, (2, 3)) assert_array_equal(dset.data, x) # transposing the data array dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir) assert_equal(dset.data.shape, (3, 2)) finally: datasets.mldata.urllib2 = _urllib2_ref
def test_fetch_one_column(): _urlopen_ref = datasets.mldata.urlopen try: dataname = 'onecol' # create fake data set in cache x = sp.arange(6).reshape(2, 3) datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}}) dset = fetch_mldata(dataname, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "data"]: assert_in(n, dset) assert_not_in("target", dset) assert_equal(dset.data.shape, (2, 3)) assert_array_equal(dset.data, x) # transposing the data array dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir) assert_equal(dset.data.shape, (3, 2)) finally: datasets.mldata.urlopen = _urlopen_ref
def test_fetch_multiple_column(): _urllib2_ref = datasets.mldata.urllib2 try: # create fake data set in cache x = sp.arange(6).reshape(2, 3) y = sp.array([1, -1]) z = sp.arange(12).reshape(4, 3) # by default dataname = 'threecol-default' datasets.mldata.urllib2 = mock_urllib2({ dataname: ({ 'label': y, 'data': x, 'z': z }, ['z', 'data', 'label']) }) dset = fetch_mldata(dataname, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) assert_not_in("y", dset) assert_array_equal(dset.data, x) assert_array_equal(dset.target, y) assert_array_equal(dset.z, z.T) # by order dataname = 'threecol-order' datasets.mldata.urllib2 = mock_urllib2( {dataname: ({ 'y': y, 'x': x, 'z': z }, ['y', 'x', 'z'])}) dset = fetch_mldata(dataname, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) assert_not_in("y", dset) assert_array_equal(dset.data, x) assert_array_equal(dset.target, y) assert_array_equal(dset.z, z.T) # by number dataname = 'threecol-number' datasets.mldata.urllib2 = mock_urllib2( {dataname: ({ 'y': y, 'x': x, 'z': z }, ['z', 'x', 'y'])}) dset = fetch_mldata(dataname, target_name=2, data_name=0, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) assert_not_in("z", dset) assert_array_equal(dset.data, z) assert_array_equal(dset.target, y) # by name dset = fetch_mldata(dataname, target_name='y', data_name='z', data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) assert_not_in("z", dset) finally: datasets.mldata.urllib2 = _urllib2_ref
def test_fetch_multiple_column(): _urlopen_ref = datasets.mldata.urlopen try: # create fake data set in cache x = sp.arange(6).reshape(2, 3) y = sp.array([1, -1]) z = sp.arange(12).reshape(4, 3) # by default dataname = 'threecol-default' datasets.mldata.urlopen = mock_mldata_urlopen({ dataname: ( { 'label': y, 'data': x, 'z': z, }, ['z', 'data', 'label'], ), }) dset = fetch_mldata(dataname, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) assert_not_in("y", dset) assert_array_equal(dset.data, x) assert_array_equal(dset.target, y) assert_array_equal(dset.z, z.T) # by order dataname = 'threecol-order' datasets.mldata.urlopen = mock_mldata_urlopen({ dataname: ({'y': y, 'x': x, 'z': z}, ['y', 'x', 'z']), }) dset = fetch_mldata(dataname, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) assert_not_in("y", dset) assert_array_equal(dset.data, x) assert_array_equal(dset.target, y) assert_array_equal(dset.z, z.T) # by number dataname = 'threecol-number' datasets.mldata.urlopen = mock_mldata_urlopen({ dataname: ({'y': y, 'x': x, 'z': z}, ['z', 'x', 'y']), }) dset = fetch_mldata(dataname, target_name=2, data_name=0, data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) assert_not_in("z", dset) assert_array_equal(dset.data, z) assert_array_equal(dset.target, y) # by name dset = fetch_mldata(dataname, target_name='y', data_name='z', data_home=tmpdir) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) assert_not_in("z", dset) finally: datasets.mldata.urlopen = _urlopen_ref