Example #1
    def test_index_rows_dense(self):
        """ Tests get a slice of rows from the ds.array using lists as index
        """
        config.session.execute("TRUNCATE TABLE hecuba.istorage")
        config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

        bn, bm = 5, 5
        x = np.random.randint(100, size=(10, 10))
        ds_data = ds.array(x=x, block_size=(bn, bm))
        data = ds.array(x=x, block_size=(bn, bm))
        data.make_persistent(name="hecuba_dislib.test_array")

        indices_lists = [([0, 5], [0, 5])]

        for rows, cols in indices_lists:
            got = data[rows].collect()
            expected = ds_data[rows].collect()
            self.assertTrue(equal(got, expected))

        # Try slicing with irregular array
        x = ds_data[1:, 1:]
        data_sliced = data[1:, 1:]

        for rows, cols in indices_lists:
            got = data_sliced[rows].collect()
            expected = x[rows].collect()

            self.assertTrue(equal(got, expected))
Example #2
    def test_univariate(self):
        """Tests fit() and predict(), univariate."""
        x_data = np.array([1, 2, 3, 4, 5])
        y_data = np.array([2, 1, 1, 2, 4.5])

        bn, bm = 2, 1

        x = ds.array(x=x_data, block_size=(bn, bm))
        y = ds.array(x=y_data, block_size=(bn, bm))

        reg = LinearRegression()
        reg.fit(x, y)
        self.assertTrue(np.allclose(reg.coef_.collect(), 0.6))
        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3))

        # Predict one sample
        x_test = np.array([3])
        test_data = ds.array(x=x_test, block_size=(1, 1))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, 2.1))

        # Predict multiple samples
        x_test = np.array([3, 5, 6])
        test_data = ds.array(x=x_test, block_size=(bn, bm))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9]))
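Note: the expected values in this test follow from ordinary least squares on
the five points; a quick standalone check with plain NumPy (independent of
dislib):

    import numpy as np

    x = np.array([1, 2, 3, 4, 5], dtype=float)
    y = np.array([2, 1, 1, 2, 4.5])
    # Closed-form simple linear regression: slope = Sxy / Sxx.
    slope = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
    intercept = y.mean() - slope * x.mean()
    print(slope, intercept)  # 0.6 0.3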
Example #3
    def test_multivariate(self):
        """Tests fit() and predict(), multivariate."""
        x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
        y_data = np.array([2, 1, 1, 2, 4.5])

        bn, bm = 2, 2

        x = ds.array(x=x_data, block_size=(bn, bm))
        y = ds.array(x=y_data, block_size=(bn, 1))

        reg = LinearRegression()
        reg.fit(x, y)
        self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875]))
        self.assertTrue(np.allclose(reg.intercept_.collect(), 0.240625))

        # Predict one sample
        x_test = np.array([3, 2])
        test_data = ds.array(x=x_test, block_size=(1, bm))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, 2.1))

        # Predict multiple samples
        x_test = np.array([[3, 2], [4, 4], [1, 3]])
        test_data = ds.array(x=x_test, block_size=(bn, bm))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125]))
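Note: the expected multivariate coefficients can be reproduced with a plain
NumPy least-squares solve (a standalone check, not part of the test):

    import numpy as np

    x = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]], dtype=float)
    y = np.array([2, 1, 1, 2, 4.5])
    # Prepend a column of ones so the first solution entry is the intercept.
    beta, *_ = np.linalg.lstsq(np.hstack([np.ones((5, 1)), x]), y, rcond=None)
    print(beta)  # [0.240625 0.421875 0.296875]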
Example #4
    def test_fit_and_predict(self):
        """Tests LinearRegression's fit() and predict()"""
        x_data = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
        y_data = np.array([2, 1, 1, 2, 4.5]).reshape(-1, 1)

        bn, bm = 2, 2

        x = ds.array(x=x_data, block_size=(bn, bm))
        y = ds.array(x=y_data, block_size=(bn, bm))

        reg = LinearRegression()
        reg.fit(x, y)
        # y = 0.6 * x + 0.3

        reg.coef_ = compss_wait_on(reg.coef_)
        reg.intercept_ = compss_wait_on(reg.intercept_)

        self.assertTrue(np.allclose(reg.coef_, 0.6))
        self.assertTrue(np.allclose(reg.intercept_, 0.3))

        x_test = np.array([3, 5]).reshape(-1, 1)
        test_data = ds.array(x=x_test, block_size=(bn, bm))
        pred = reg.predict(test_data).collect()

        self.assertTrue(np.allclose(pred, [2.1, 3.3]))
Example #5
    def test_sparse(self):
        """ Tests fit_transforms with sparse data"""
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)

        dense_arr = ds.array(x, block_size=(300, 2))
        sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2))

        sc = StandardScaler()
        dense_scaled = sc.fit_transform(dense_arr)
        dense_mean = sc.mean_.collect()
        dense_var = sc.var_.collect()

        sparse_scaled = sc.fit_transform(sparse_arr)
        sparse_mean = sc.mean_.collect()
        sparse_var = sc.var_.collect()

        csr_scaled = sparse_scaled.collect()
        arr_scaled = dense_scaled.collect()

        self.assertTrue(issparse(csr_scaled))
        self.assertTrue(sparse_scaled._sparse)
        self.assertTrue(sc.var_._sparse)
        self.assertTrue(sc.mean_._sparse)
        self.assertTrue(issparse(sparse_mean))
        self.assertTrue(issparse(sparse_var))

        self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled))
        self.assertTrue(np.allclose(sparse_mean.toarray(), dense_mean))
        self.assertTrue(np.allclose(sparse_var.toarray(), dense_var))
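Note: the scaling itself can be sanity-checked with plain NumPy, assuming the
usual z-score transform; every column of the scaled output should then have
zero mean and unit variance:

    import numpy as np

    x = np.random.random((6, 2))
    scaled = (x - x.mean(axis=0)) / np.sqrt(x.var(axis=0))
    print(scaled.mean(axis=0), scaled.var(axis=0))  # ~[0, 0] and ~[1, 1]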
Example #6
    def test_kron(self, shape_a, shape_b, sparse):
        """ Tests kronecker product """
        np.random.seed()

        a_np = np.random.random(shape_a)
        b_np = np.random.random(shape_b)
        expected = np.kron(a_np, b_np)

        if sparse:
            a_np = sp.csr_matrix(a_np)
            b_np = sp.csr_matrix(b_np)

        b0 = np.random.randint(1, a_np.shape[0] + 1)
        b1 = np.random.randint(1, a_np.shape[1] + 1)
        b2 = np.random.randint(1, b_np.shape[0] + 1)
        b3 = np.random.randint(1, b_np.shape[1] + 1)

        a = ds.array(a_np, (b0, b1))
        b = ds.array(b_np, (b2, b3))

        b4 = np.random.randint(1, (b0 * b2) + 1)
        b5 = np.random.randint(1, (b1 * b3) + 1)

        computed = ds.kron(a, b, (b4, b5))

        self.assertTrue(_validate_array(computed))

        computed = computed.collect(False)

        # convert to a dense ndarray so the result can be compared with the
        # expected array, which was computed with np.kron on dense input
        if a._sparse:
            computed = computed.toarray()

        self.assertTrue(_equal_arrays(expected, computed))
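Note: np.kron of an (m, n) array and a (p, q) array yields an (m * p, n * q)
array, which is what bounds the result block sizes drawn above; a quick
standalone check:

    import numpy as np

    a = np.random.random((2, 3))
    b = np.random.random((4, 5))
    assert np.kron(a, b).shape == (2 * 4, 3 * 5)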
Example #7
    def test_score(self, collect):
        seed = 666

        # negative points belong to class 1, positives to 0
        p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

        x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
        y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

        csvm = CascadeSVM(cascade_arity=3,
                          max_iter=10,
                          tol=1e-4,
                          kernel='rbf',
                          c=2,
                          gamma=0.1,
                          check_convergence=True,
                          random_state=seed,
                          verbose=False)

        csvm.fit(x, y)

        # the points are linearly separable, so scoring the training points
        # should yield 100% accuracy
        x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2))
        y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1))

        accuracy = csvm.score(x_test, y_test, collect)
        if not collect:
            accuracy = compss_wait_on(accuracy)

        self.assertEqual(accuracy, 1.0)
Example #8
    def test_make_regression_sklearn_max_predict(self):
        """Tests RandomForestRegressor predict with sklearn_max."""
        x, y = make_regression(
            n_samples=3000,
            n_features=10,
            n_informative=4,
            shuffle=True,
            random_state=0,
        )
        x_train = ds.array(x[::2], (300, 10))
        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
        x_test = ds.array(x[1::2], (300, 10))
        y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))

        rf = RandomForestRegressor(random_state=0, sklearn_max=10)

        rf.fit(x_train, y_train)
        accuracy1 = compss_wait_on(rf.score(x_test, y_test))

        y_pred = rf.predict(x_test).collect()
        y_true = y[1::2]
        accuracy2 = _determination_coefficient(y_true, y_pred)

        self.assertGreater(accuracy1, 0.85)
        self.assertGreater(accuracy2, 0.85)
        self.assertAlmostEqual(accuracy1, accuracy2)
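Note: _determination_coefficient is a helper defined elsewhere in the test
module. A minimal sketch, assuming it computes the standard coefficient of
determination (R^2):

    import numpy as np

    def _determination_coefficient(y_true, y_pred):
        # R^2 = 1 - SS_res / SS_tot
        ss_res = np.sum((y_true - y_pred) ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        return 1 - ss_res / ss_tot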
Example #9
    def test_make_classification_sklearn_max_predict_proba(self):
        """Tests RandomForestClassifier predict_proba with sklearn_max."""
        x, y = make_classification(
            n_samples=3000,
            n_features=10,
            n_classes=3,
            n_informative=4,
            n_redundant=2,
            n_repeated=1,
            n_clusters_per_class=2,
            shuffle=True,
            random_state=0,
        )
        x_train = ds.array(x[::2], (300, 10))
        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
        x_test = ds.array(x[1::2], (300, 10))
        y_test = y[1::2]

        rf = RandomForestClassifier(random_state=0, sklearn_max=10)

        rf.fit(x_train, y_train)
        probabilities = rf.predict_proba(x_test).collect()
        rf.classes = compss_wait_on(rf.classes)
        y_pred = rf.classes[np.argmax(probabilities, axis=1)]
        accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
        self.assertGreater(accuracy, 0.7)
Example #10
def load_movielens(data_path, train_ratio=0.9):
    cols = ['user_id', 'movie_id', 'rating', 'timestamp']
    file = 'sample_movielens_ratings.csv'

    # 30 users, 100 movies
    df = pd.read_csv(os.path.join(data_path, file),
                     delimiter=',',
                     names=cols,
                     usecols=cols[0:3]).sample(frac=1, random_state=666)

    # just in case there are movies/users without ratings
    n_m = max(df.movie_id.nunique(), max(df.movie_id) + 1)
    n_u = max(df.user_id.nunique(), max(df.user_id) + 1)

    idx = int(df.shape[0] * train_ratio)

    tr_df = df.iloc[:idx]
    te_df = df.iloc[idx:]

    train = csr_matrix((tr_df.rating, (tr_df.user_id, tr_df.movie_id)),
                       shape=(n_u, n_m))
    test = csr_matrix((te_df.rating, (te_df.user_id, te_df.movie_id)))

    x_size, y_size = ceil(train.shape[0] / 2), ceil(train.shape[1] / 3)
    train_arr = ds.array(train, block_size=(x_size, y_size))

    x_size, y_size = ceil(test.shape[0] / 2), ceil(test.shape[1] / 3)
    test_arr = ds.array(test, block_size=(x_size, y_size))

    return train_arr, test_arr
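Note: csr_matrix((data, (row, col))) builds a sparse matrix from COO-style
triplets: duplicate (row, col) pairs are summed, and the shape is inferred
from the largest indices unless passed explicitly, which is why
shape=(n_u, n_m) is given for the training matrix. A small illustration:

    from scipy.sparse import csr_matrix

    m = csr_matrix(([5.0, 3.0], ([0, 2], [1, 4])), shape=(30, 100))
    print(m.shape, m[0, 1], m[2, 4])  # (30, 100) 5.0 3.0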
Example #11
    def test_median(self):
        """ Tests the median """
        x_np = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        x = ds.array(x_np, block_size=(2, 2))
        xm = x.median()

        self.assertTrue(_validate_array(xm))

        expected = np.median(x_np, axis=0)

        self.assertTrue(_equal_arrays(expected, xm.collect()))

        xm = x.median(axis=1)

        self.assertTrue(_validate_array(xm))

        expected = np.median(x_np, axis=1)

        self.assertTrue(_equal_arrays(expected, xm.collect()))

        with self.assertRaises(NotImplementedError):
            x_csr = ds.array(sp.csr_matrix([[1, 2, 3],
                                            [4, 5, 6],
                                            [7, 8, 9]]), (2, 2))
            x_csr.median()
Example #12
def main():
    """
    Linear regression example with plot
    """

    # Example data
    x = np.array([
        1000, 4000, 5000, 4500, 3000, 4000, 9000, 11000, 15000, 12000, 7000,
        3000
    ])
    y = np.array([
        9914, 40487, 54324, 50044, 34719, 42551, 94871, 118914, 158484, 131348,
        78504, 36284
    ])
    x_ds = ds.array(x[:, np.newaxis], (4, 1))
    y_ds = ds.array(y[:, np.newaxis], (4, 1))
    reg = LinearRegression()
    reg.fit(x_ds, y_ds)
    reg.coef_ = compss_wait_on(reg.coef_)
    reg.intercept_ = compss_wait_on(reg.intercept_)
    print(reg.coef_, reg.intercept_)

    # Plot the result (scatter, plot and show are assumed to be imported
    # from matplotlib.pyplot)
    scatter(x, y, marker='x')
    x_mesh = np.linspace(min(x), max(x), 1000)
    plot(x_mesh, [reg.coef_ * x + reg.intercept_ for x in x_mesh])
    show()
Example #13
    def test_knn_fit(self):
        """ Tests knn fit_predict and compares the result with
            regular ds-arrays """
        config.session.execute("TRUNCATE TABLE hecuba.istorage")
        config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

        x = np.random.random((1500, 5))
        block_size = (500, 5)
        block_size2 = (250, 5)

        data = ds.array(x, block_size=block_size)
        q_data = ds.array(x, block_size=block_size2)

        data_h = ds.array(x, block_size=block_size)
        data_h.make_persistent(name="hecuba_dislib.test_array")
        q_data_h = ds.array(x, block_size=block_size2)
        q_data_h.make_persistent(name="hecuba_dislib.test_array_q")

        knn = NearestNeighbors(n_neighbors=10)
        knn.fit(data)
        dist, ind = knn.kneighbors(q_data)

        knn_h = NearestNeighbors(n_neighbors=10)
        knn_h.fit(data_h)
        dist_h, ind_h = knn_h.kneighbors(q_data_h)

        self.assertTrue(
            np.allclose(dist.collect(), dist_h.collect(), atol=1e-7))
        self.assertTrue(np.array_equal(ind.collect(), ind_h.collect()))
Example #14
    def test_linear_regression(self):
        """ Tests linear regression fit_predict and compares the result with
            regular ds-arrays """
        config.session.execute("TRUNCATE TABLE hecuba.istorage")
        config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

        x_data = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
        y_data = np.array([2, 1, 1, 2, 4.5]).reshape(-1, 1)

        block_size = (x_data.shape[0] // 3, x_data.shape[1])

        x = ds.array(x=x_data, block_size=block_size)
        x.make_persistent(name="hecuba_dislib.test_array_x")
        y = ds.array(x=y_data, block_size=block_size)
        y.make_persistent(name="hecuba_dislib.test_array_y")

        reg = LinearRegression()
        reg.fit(x, y)
        # y = 0.6 * x + 0.3

        reg.coef_ = compss_wait_on(reg.coef_)
        reg.intercept_ = compss_wait_on(reg.intercept_)
        self.assertTrue(np.allclose(reg.coef_, 0.6))
        self.assertTrue(np.allclose(reg.intercept_, 0.3))

        x_test = np.array([3, 5]).reshape(-1, 1)
        test_data = ds.array(x=x_test, block_size=block_size)
        test_data.make_persistent(name="hecuba_dislib.test_array_test")
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, [2.1, 3.3]))
Example #15
def load_movielens(train_ratio=0.9):
    file = 'tests/files/sample_movielens_ratings.csv'

    # 'user_id', 'movie_id', 'rating', 'timestamp'

    data = np.genfromtxt(file, dtype='int', delimiter=',', usecols=range(3))

    # just in case there are movies/users without ratings
    # movie_id
    n_m = max(len(np.unique(data[:, 1])), max(data[:, 1]) + 1)
    # user_id
    n_u = max(len(np.unique(data[:, 0])), max(data[:, 0]) + 1)

    idx = int(data.shape[0] * train_ratio)

    train_data = data[:idx]
    test_data = data[idx:]

    train = csr_matrix(
        (train_data[:, 2], (train_data[:, 0], train_data[:, 1])),
        shape=(n_u, n_m))

    test = csr_matrix(
        (test_data[:, 2], (test_data[:, 0], test_data[:, 1])))

    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
    train_arr = ds.array(train, block_size=(x_size, y_size))

    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
    test_arr = ds.array(test, block_size=(x_size, y_size))

    return train_arr, test_arr
Example #16
    def test_make_classification_hard_vote_predict(self):
        """Tests RandomForestClassifier predict with hard_vote."""
        x, y = make_classification(
            n_samples=3000,
            n_features=10,
            n_classes=3,
            n_informative=4,
            n_redundant=2,
            n_repeated=1,
            n_clusters_per_class=2,
            shuffle=True,
            random_state=0,
        )
        x_train = ds.array(x[::2], (300, 10))
        y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
        x_test = ds.array(x[1::2], (300, 10))
        y_test = y[1::2]

        rf = RandomForestClassifier(random_state=0,
                                    sklearn_max=10,
                                    hard_vote=True)

        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test).collect()
        accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
        self.assertGreater(accuracy, 0.7)
Example #17
    def test_power(self):
        """ Tests ds-array power and sqrt """
        orig = np.array([[1, 2, 3], [4, 5, 6]])
        x = ds.array(orig, block_size=(2, 1))
        xp = x ** 2
        xs = xp.sqrt()

        self.assertTrue(_validate_array(xp))
        self.assertTrue(_validate_array(xs))

        expected = np.array([[1, 4, 9], [16, 25, 36]])

        self.assertTrue(_equal_arrays(expected, xp.collect()))
        self.assertTrue(_equal_arrays(orig, xs.collect()))

        orig = sp.csr_matrix([[1, 2, 3], [4, 5, 6]])
        x = ds.array(orig, block_size=(2, 1))
        xp = x ** 2
        xs = xp.sqrt()

        self.assertTrue(_validate_array(xp))
        self.assertTrue(_validate_array(xs))

        expected = sp.csr_matrix([[1, 4, 9], [16, 25, 36]])

        self.assertTrue(_equal_arrays(expected, xp.collect()))
        self.assertTrue(_equal_arrays(orig, xs.collect()))

        with self.assertRaises(NotImplementedError):
            x ** x
Example #18
def main():
    x, y = load_iris(return_X_y=True)

    indices = np.arange(len(x))
    shuffle(indices)

    # use 80% of samples for training
    train_idx = indices[:int(0.8 * len(x))]
    test_idx = indices[int(0.8 * len(x)):]

    # Train the RF classifier
    print("- Training Random Forest classifier with %s samples of Iris "
          "dataset." % len(train_idx))
    x_train = ds.array(x[train_idx], (10, 4))
    y_train = ds.array(y[train_idx][:, np.newaxis], (10, 1))
    forest = RandomForestClassifier(10)
    forest.fit(x_train, y_train)

    # Test the trained RF classifier
    print("- Testing the classifier.", end='')
    x_test = ds.array(x[test_idx], (10, 4))
    y_real = ds.array(y[test_idx][:, np.newaxis], (10, 1))
    y_pred = forest.predict(x_test)

    score = compss_wait_on(forest.score(x_test, y_real))

    # Put the results in a dataframe and print the accuracy
    df = pd.DataFrame(data=list(zip(y[test_idx], y_pred.collect())),
                      columns=['Label', 'Predicted'])
    print(" Predicted values: \n\n%s" % df)
    print("\n- Classifier accuracy: %s" % score)
Example #19
    def test_predict(self):
        seed = 666

        # negative points belong to class 1, positives to 0
        p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

        x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
        y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

        csvm = CascadeSVM(cascade_arity=3,
                          max_iter=10,
                          tol=1e-4,
                          kernel='linear',
                          c=2,
                          gamma=0.1,
                          check_convergence=False,
                          random_state=seed,
                          verbose=False)

        csvm.fit(x, y)

        # p5 should belong to class 0, p6 to class 1
        p5, p6 = np.array([1, 1]), np.array([-1, -1])

        x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2))

        y_pred = csvm.predict(x_test)

        l1, l2, l3, l4, l5, l6 = y_pred.collect()

        self.assertTrue(l1 == l2 == l5 == 0)
        self.assertTrue(l3 == l4 == l6 == 1)
Example #20
def load_movielens(train_ratio=0.9):
    file = 'tests/files/sample_movielens_ratings.csv'

    cols = ['user_id', 'movie_id', 'rating', 'timestamp']

    # 30 users, 100 movies
    df = pd.read_csv(file, names=cols, usecols=cols[0:3])

    # just in case there are movies/users without ratings
    n_m = max(df.movie_id.nunique(), max(df.movie_id) + 1)
    n_u = max(df.user_id.nunique(), max(df.user_id) + 1)

    idx = int(df.shape[0] * train_ratio)

    train_df = df.iloc[:idx]
    test_df = df.iloc[idx:]

    train = csr_matrix(
        (train_df.rating, (train_df.user_id, train_df.movie_id)),
        shape=(n_u, n_m))
    test = csr_matrix(
        (test_df.rating, (test_df.user_id, test_df.movie_id)))

    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
    train_arr = ds.array(train, block_size=(x_size, y_size))

    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
    test_arr = ds.array(test, block_size=(x_size, y_size))

    return train_arr, test_arr
Example #21
    def test_make_classification_hard_vote_score_mix(self):
        """Tests RandomForestClassifier score with hard_vote, sklearn_max,
        distr_depth and max_depth."""
        x, y = make_classification(n_samples=3000,
                                   n_features=10,
                                   n_classes=3,
                                   n_informative=4,
                                   n_redundant=2,
                                   n_repeated=1,
                                   n_clusters_per_class=2,
                                   shuffle=True,
                                   random_state=0)
        x_train = ds.array(x[:len(x) // 2], (300, 10))
        y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
        x_test = ds.array(x[len(x) // 2:], (300, 10))
        y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))

        rf = RandomForestClassifier(random_state=0,
                                    sklearn_max=100,
                                    distr_depth=2,
                                    max_depth=12,
                                    hard_vote=True)

        rf.fit(x_train, y_train)
        accuracy = compss_wait_on(rf.score(x_test, y_test))
        self.assertGreater(accuracy, 0.7)
Example #22
def main():
    x_np, y_np = datasets.load_iris(return_X_y=True)
    x = ds.array(x_np, (30, 4))
    y = ds.array(y_np[:, np.newaxis], (30, 1))
    parameters = {
        'n_estimators': (1, 2, 4, 8, 16, 32),
        'max_depth': range(3, 5)
    }
    rf = RandomForestClassifier()
    searcher = GridSearchCV(rf, parameters, cv=5)
    np.random.seed(0)
    searcher.fit(x, y)
    print(searcher.cv_results_['params'])
    print(searcher.cv_results_['mean_test_score'])
    pd_df = pd.DataFrame.from_dict(searcher.cv_results_)
    print(pd_df[['params', 'mean_test_score']])
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        print(pd_df)
    print(searcher.best_estimator_)
    print(searcher.best_score_)
    print(searcher.best_params_)
    print(searcher.best_index_)
    print(searcher.scorer_)
    print(searcher.n_splits_)
Example #23
    def test_fit_predict(self):
        """ Tests fit and predicts methods """

        np.random.seed(42)

        n_samples, n_features = 50, 100
        X = np.random.randn(n_samples, n_features)

        # Decreasing coefficients with alternating signs for visualization
        idx = np.arange(n_features)
        coef = (-1)**idx * np.exp(-idx / 10)
        coef[10:] = 0  # sparsify coef
        y = np.dot(X, coef)

        # Add noise
        y += 0.01 * np.random.normal(size=n_samples)

        n_samples = X.shape[0]
        X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
        X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]

        lasso = Lasso(lmbd=0.1, max_iter=50)

        lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1)))
        y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100)))
        r2_score_lasso = r2_score(y_test, y_pred_lasso.collect())

        self.assertEqual(r2_score_lasso, 0.9481746925431124)
Example #24
    def test_scoring_callable(self):
        """Tests GridSearchCV with callable scoring parameter."""
        x_np, y_np = datasets.load_iris(return_X_y=True)
        x = ds.array(x_np, (30, 4))
        y = ds.array(y_np[:, np.newaxis], (30, 1))

        param_grid = {'n_estimators': (2, 4)}
        rf = RandomForestClassifier()

        def scoring(clf, x_score, y_real):
            return clf.score(x_score, y_real)

        searcher = GridSearchCV(rf, param_grid, cv=3, scoring=scoring)
        searcher.fit(x, y)

        self.assertTrue(hasattr(searcher, 'cv_results_'))
        self.assertTrue(hasattr(searcher, 'best_estimator_'))
        self.assertTrue(hasattr(searcher, 'best_score_'))
        self.assertTrue(hasattr(searcher, 'best_params_'))
        self.assertTrue(hasattr(searcher, 'best_index_'))
        self.assertTrue(hasattr(searcher, 'scorer_'))

        def invalid_scoring(clf, x_score, y_score):
            return '2'

        searcher = GridSearchCV(rf, param_grid, cv=3, scoring=invalid_scoring)
        with self.assertRaisesRegex(ValueError,
                                    'scoring must return a number'):
            searcher.fit(x, y)
Example #25
    def test_sparse(self, feature_range):
        """ Tests fit_transforms with sparse data"""
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)

        dense_arr = ds.array(x, block_size=(300, 2))
        sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2))

        sc = MinMaxScaler(feature_range=feature_range)
        dense_scaled = sc.fit_transform(dense_arr)
        dense_min = sc.data_min_.collect()
        dense_max = sc.data_max_.collect()

        sparse_scaled = sc.fit_transform(sparse_arr)
        sparse_min = sc.data_min_.collect()
        sparse_max = sc.data_max_.collect()

        csr_scaled = sparse_scaled.collect()
        arr_scaled = dense_scaled.collect()

        self.assertTrue(issparse(csr_scaled))
        self.assertTrue(sparse_scaled._sparse)
        self.assertTrue(sc.data_min_._sparse)
        self.assertTrue(sc.data_max_._sparse)
        self.assertTrue(issparse(sparse_min))
        self.assertTrue(issparse(sparse_max))

        self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled))
        self.assertTrue(np.allclose(sparse_min.toarray(), dense_min))
        self.assertTrue(np.allclose(sparse_max.toarray(), dense_max))
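Note: MinMaxScaler rescales each feature to feature_range; a plain NumPy
sketch of the standard formula:

    import numpy as np

    x = np.random.random((6, 2))
    lo, hi = -1.0, 1.0  # an example feature_range
    x_std = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
    x_scaled = x_std * (hi - lo) + lo
    print(x_scaled.min(axis=0), x_scaled.max(axis=0))  # [-1, -1] and [1, 1]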
Example #26
    def test_refit_callable(self):
        """Tests GridSearchCV with callable refit parameter."""
        x_np, y_np = datasets.load_iris(return_X_y=True)
        x = ds.array(x_np, (30, 4))
        y = ds.array(y_np[:, np.newaxis], (30, 1))
        param_grid = {'n_estimators': (2, 4)}
        rf = RandomForestClassifier()

        best_index = 1

        def refit(results):
            return best_index

        searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
        searcher.fit(x, y)

        self.assertTrue(hasattr(searcher, 'cv_results_'))
        self.assertTrue(hasattr(searcher, 'best_estimator_'))
        self.assertFalse(hasattr(searcher, 'best_score_'))
        self.assertTrue(hasattr(searcher, 'best_params_'))
        self.assertTrue(hasattr(searcher, 'best_index_'))
        self.assertTrue(hasattr(searcher, 'scorer_'))

        best_index = 'str'
        searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
        with self.assertRaises(TypeError):
            searcher.fit(x, y)

        best_index = -1
        searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
        with self.assertRaises(IndexError):
            searcher.fit(x, y)
Example #27
    def test_univariate_no_intercept(self):
        """Tests fit() and predict(), univariate, fit_intercept=False."""
        x_data = np.array([1, 2, 3, 4, 5])
        y_data = np.array([2, 1, 1, 2, 4.5])

        bn, bm = 2, 1

        x = ds.array(x=x_data, block_size=(bn, bm))
        y = ds.array(x=y_data, block_size=(bn, bm))

        reg = LinearRegression(fit_intercept=False)
        reg.fit(x, y)
        self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818))
        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))

        # Predict one sample
        x_test = np.array([3])
        test_data = ds.array(x=x_test, block_size=(1, 1))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, 2.04545455))

        # Predict multiple samples
        x_test = np.array([3, 5, 6])
        test_data = ds.array(x=x_test, block_size=(bn, bm))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091]))
Example #28
    def test_fit(self):
        """Tests GridSearchCV fit()."""
        x_np, y_np = datasets.load_iris(return_X_y=True)
        x = ds.array(x_np, (30, 4))
        y = ds.array(y_np[:, np.newaxis], (30, 1))

        param_grid = {'n_estimators': (2, 4), 'max_depth': range(3, 5)}
        rf = RandomForestClassifier()

        searcher = GridSearchCV(rf, param_grid)
        searcher.fit(x, y)

        expected_keys = {
            'param_max_depth', 'param_n_estimators', 'params',
            'mean_test_score', 'std_test_score', 'rank_test_score'
        }
        split_keys = {'split%d_test_score' % i for i in range(5)}
        expected_keys.update(split_keys)
        self.assertSetEqual(set(searcher.cv_results_.keys()), expected_keys)

        expected_params = [(3, 2), (3, 4), (4, 2), (4, 4)]
        for params in searcher.cv_results_['params']:
            m = params['max_depth']
            n = params['n_estimators']
            self.assertIn((m, n), expected_params)
            expected_params.remove((m, n))
        self.assertEqual(len(expected_params), 0)

        self.assertTrue(hasattr(searcher, 'best_estimator_'))
        self.assertTrue(hasattr(searcher, 'best_score_'))
        self.assertTrue(hasattr(searcher, 'best_params_'))
        self.assertTrue(hasattr(searcher, 'best_index_'))
        self.assertTrue(hasattr(searcher, 'scorer_'))
        self.assertEqual(searcher.n_splits_, 5)
Example #29
    def test_multivariate_no_intercept(self):
        """Tests fit() and predict(), multivariate, fit_intercept=False."""
        x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
        y_data = np.array([2, 1, 1, 2, 4.5])

        bn, bm = 2, 2

        x = ds.array(x=x_data, block_size=(bn, bm))
        y = ds.array(x=y_data, block_size=(bn, 1))

        reg = LinearRegression(fit_intercept=False)
        reg.fit(x, y)
        self.assertTrue(
            np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]))
        self.assertTrue(np.allclose(reg.intercept_.collect(), 0))

        # Predict one sample
        x_test = np.array([3, 2])
        test_data = ds.array(x=x_test, block_size=(1, bm))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, [2.05649718]))

        # Predict multiple samples
        x_test = np.array([[3, 2], [4, 4], [1, 3]])
        test_data = ds.array(x=x_test, block_size=(bn, bm))
        pred = reg.predict(test_data).collect()
        self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678]))
Example #30
    def test_shuffle_xy_sparse(self):
        """ Tests shuffle for given sparse x and sparse y, and random_state.
        Tests that the shuffled arrays contain the same rows as the original
        data, and that the position has changed for some row.
        """
        np.random.seed(0)
        x = sparse.random(8, 10, density=0.5).tocsr()
        x_ds = ds.array(x, (3, 5))
        y = sparse.random(8, 1, density=0.5).tocsr()
        y_ds = ds.array(y, (4, 1))

        shuffled_x, shuffled_y = shuffle(x_ds, y_ds, random_state=0)
        shuffled_x = shuffled_x.collect()
        shuffled_y = shuffled_y.collect()

        # Assert that at least one of the first 2 samples has changed
        self.assertFalse((x[0:2] != shuffled_x[0:2]).nnz == 0)
        # Assert that the shuffled data has the same shape.
        self.assertEqual(shuffled_x.shape, x.shape)
        self.assertEqual(shuffled_y.shape[0], y.shape[0])
        # Assert that all rows from x are found in the shuffled_x, and that the
        # same permutation has been used to shuffle x and y.
        for idx, x_row in enumerate(x):
            found = False
            for shuffled_idx, shuffle_x_row in enumerate(shuffled_x):
                if (shuffle_x_row != x_row).nnz == 0:  # If rows are equal
                    found = True
                    self.assertEqual(y[idx, 0], shuffled_y[shuffled_idx, 0])
                    break
            self.assertTrue(found)
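Note: the (a != b).nnz == 0 idiom used above is the usual way to test two
SciPy sparse matrices for elementwise equality; == is avoided because, for
mostly-equal matrices, it would produce a result that is True almost
everywhere and therefore expensive to store. A minimal illustration:

    from scipy.sparse import csr_matrix

    a = csr_matrix([[1, 0], [0, 2]])
    b = csr_matrix([[1, 0], [0, 2]])
    assert (a != b).nnz == 0  # no differing entries, so the matrices are equal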