예제 #1
0
    def split(self, x, y=None):
        """Generates K-fold splits.

        The rows are divided into ``n_splits`` consecutive sections whose
        sizes differ by at most one sample (the leading sections absorb the
        remainder). One partition is produced per section by handing the
        section's start and end offsets to ``get_kfold_partition``.

        Parameters
        ----------
        x : ds-array
            Samples array.
        y : ds-array, optional (default=None)
            Corresponding labels or values.

        Yields
        ------
        train_data : train_x, train_y
            The training ds-arrays for that split. If y is None, train_y is
            None.
        test_data : test_x, test_y
            The testing ds-arrays data for that split. If y is None, test_y is
            None.
        """
        n_folds = self.n_splits

        if self.shuffle:
            result = utils.shuffle(x, y, self.random_state)
            # Without labels a single array comes back instead of a pair.
            if y is not None:
                x, y = result
            else:
                x = result

        # Every fold gets the base size; the first `remainder` folds get one
        # additional sample so that all rows are covered.
        base_size, remainder = divmod(x.shape[0], n_folds)
        fold_sizes = np.full((n_folds,), base_size, dtype=int)
        fold_sizes[:remainder] += 1

        # Cumulative sizes are the end offsets of consecutive folds.
        start = 0
        for end in np.cumsum(fold_sizes):
            yield get_kfold_partition(x, y, start, end)
            start = end
예제 #2
0
    def test_shuffle_xy_sparse(self):
        """ Checks shuffle on sparse x and sparse y with a fixed random_state:
        the shuffled arrays must hold the same rows as the input, each row
        must keep its label, and at least one row must have moved.
        """
        np.random.seed(0)
        samples = sparse.random(8, 10, density=0.5).tocsr()
        samples_ds = ds.array(samples, (3, 5))
        labels = sparse.random(8, 1, density=0.5).tocsr()
        labels_ds = ds.array(labels, (4, 1))

        x_out_ds, y_out_ds = shuffle(samples_ds, labels_ds, random_state=0)
        x_out = x_out_ds.collect()
        y_out = y_out_ds.collect()

        # The two leading rows must not both have stayed in place.
        self.assertNotEqual((samples[:2] != x_out[:2]).nnz, 0)
        # Shuffling must not alter the shapes.
        self.assertEqual(x_out.shape, samples.shape)
        self.assertEqual(y_out.shape[0], labels.shape[0])
        # Each input row has to show up in the output, and the label found at
        # that output position has to be the row's original label (i.e. x and
        # y went through the same permutation).
        for idx, row in enumerate(samples):
            # An empty element-wise difference means the rows are equal.
            match = next((pos for pos, candidate in enumerate(x_out)
                          if (candidate != row).nnz == 0), None)
            self.assertTrue(match is not None)
            self.assertEqual(labels[idx, 0], y_out[match, 0])
예제 #3
0
    def test_shuffle_xy(self):
        """ Checks shuffle on dense x and y with a fixed random_state: the
        shuffled arrays must hold the same rows as the input, each row must
        keep its label, and at least one row must have moved.
        """
        np.random.seed(0)
        samples = np.random.rand(8, 3)
        labels = np.random.rand(8, 1)
        samples_ds = ds.array(samples, (3, 2))
        labels_ds = ds.array(labels, (4, 1))

        x_out_ds, y_out_ds = shuffle(samples_ds, labels_ds, random_state=0)
        x_out = x_out_ds.collect()
        y_out = y_out_ds.collect()

        # The two leading rows must not both have stayed in place.
        self.assertFalse(np.array_equal(samples[:2], x_out[:2]))
        # Shuffling must not alter the shapes.
        self.assertEqual(x_out.shape, samples.shape)
        self.assertEqual(y_out.shape[0], labels.shape[0])
        # Each input row has to show up in the output, and the label found at
        # that output position has to be the row's original label (i.e. x and
        # y went through the same permutation).
        for idx, row in enumerate(samples):
            match = next((pos for pos, candidate in enumerate(x_out)
                          if (candidate == row).all()), None)
            self.assertTrue(match is not None)
            self.assertEqual(labels[idx], y_out[match])
예제 #4
0
def main():
    """Run the CascadeSVM fit on the KDD99 training file under
    ``performance.measure``.

    The file is loaded as a single ds-array and shuffled; column 121 is then
    used as the labels and columns 0-120 as the features.
    """
    kdd_path = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/train.csv"
    x_kdd = shuffle(ds.load_txt_file(kdd_path, block_size=(11482, 122)))

    # Take the labels first: x_kdd is rebound to the feature columns next.
    y_kdd = x_kdd[:, 121:122]
    x_kdd = x_kdd[:, :121]

    classifier = CascadeSVM(c=10000, gamma=0.01)
    performance.measure("CSVM", "KDD99", classifier.fit, x_kdd, y_kdd)
예제 #5
0
def main():
    """Run the CascadeSVM fit on the KDD99 and ijcnn1 training sets under
    ``performance.measure``.

    KDD99 is loaded from CSV and shuffled; column 121 is used as the labels
    and columns 0-120 as the features. ijcnn1 is loaded from an SVMLight file
    with sparse storage. The same estimator object is fitted on both.
    """
    dataset_dir = "/gpfs/projects/bsc19/COMPSs_DATASETS/dislib"

    x_kdd = shuffle(ds.load_txt_file(dataset_dir + "/kdd99/train.csv",
                                     block_size=(11482, 122)))
    # Take the labels first: x_kdd is rebound to the feature columns next.
    y_kdd = x_kdd[:, 121:122]
    x_kdd = x_kdd[:, :121]

    x_ij, y_ij = ds.load_svmlight_file(dataset_dir + "/ijcnn1/train",
                                       block_size=(5000, 22),
                                       n_features=22,
                                       store_sparse=True)

    classifier = CascadeSVM(c=10000, gamma=0.01)

    runs = (("KDD99", x_kdd, y_kdd), ("ijcnn1", x_ij, y_ij))
    for dataset_name, samples, labels in runs:
        performance.measure("CSVM", dataset_name, classifier.fit,
                            samples, labels)
예제 #6
0
    def test_refit_false(self):
        """Tests GridSearchCV fit() with refit=False.

        After fitting, the searcher must expose the score, parameter, index
        and scorer attributes but no ``best_estimator_``.
        """
        features, labels = datasets.load_iris(return_X_y=True)
        x, y = shuffle(ds.array(features, (30, 4)),
                       ds.array(labels[:, None], (30, 1)),
                       random_state=0)

        searcher = GridSearchCV(CascadeSVM(check_convergence=False),
                                {'max_iter': range(1, 5)},
                                cv=3, refit=False)
        searcher.fit(x, y)

        # No refit was requested, so no final estimator should be stored.
        self.assertFalse(hasattr(searcher, 'best_estimator_'))
        for attr in ('best_score_', 'best_params_', 'best_index_',
                     'scorer_'):
            self.assertTrue(hasattr(searcher, attr))
        self.assertEqual(searcher.n_splits_, 3)
예제 #7
0
    def test_shuffle_x(self):
        """ Tests shuffle for given x and random_state. Tests that the
        shuffled array contains the same rows as the original data,
        and that the position has changed for some row.
        """
        # Seed NumPy so the generated input is reproducible between runs.
        np.random.seed(0)
        x = np.random.rand(8, 3)
        x_ds = ds.array(x, (3, 2))

        shuffled_x = shuffle(x_ds, random_state=0)
        shuffled_x = shuffled_x.collect()

        # Assert that at least one of the first 2 samples has changed
        self.assertFalse(np.array_equal(x[0:2], shuffled_x[0:2]))
        # Assert that the shuffled data has the same shape.
        self.assertEqual(shuffled_x.shape, x.shape)
        # Assert that all rows from x are found in the shuffled_x.
        for x_row in x:
            found = any((shuffled_row == x_row).all()
                        for shuffled_row in shuffled_x)
            self.assertTrue(found)
예제 #8
0
    def test_shuffle_x_sparse(self):
        """ Checks shuffle on a sparse x with a fixed random_state: the
        shuffled array must hold the same rows as the input and at least one
        row must have moved.
        """
        np.random.seed(0)
        samples = sparse.random(8, 10, density=0.5).tocsr()
        samples_ds = ds.array(samples, (3, 5))

        x_out = shuffle(samples_ds, random_state=0).collect()

        # The two leading rows must not both have stayed in place.
        self.assertNotEqual((samples[:2] != x_out[:2]).nnz, 0)
        # Shuffling must not alter the shape.
        self.assertEqual(x_out.shape, samples.shape)
        # Each input row has to show up somewhere in the output; an empty
        # element-wise difference means two rows are equal.
        for row in samples:
            found = any((candidate != row).nnz == 0 for candidate in x_out)
            self.assertTrue(found)
예제 #9
0
def _build_parser():
    """Create the command-line parser of the CascadeSVM driver."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-k", "--kernel", metavar="KERNEL", type=str,
                        help="linear or rbf (default is rbf)",
                        choices=["linear", "rbf"], default="rbf")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-g", "--gamma", metavar="GAMMA", type=float,
                        help="(only for rbf kernel) default is 1 / n_features",
                        default=None)
    parser.add_argument("-c", metavar="C", type=float, default=1,
                        help="Penalty parameter C of the error term. "
                             "Default:1")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("-t", "--test-file", metavar="TEST_FILE_PATH",
                        help="test file path", type=str, required=False)
    parser.add_argument("-o", "--output_file", metavar="OUTPUT_FILE_PATH",
                        help="output file path", type=str, required=False)
    parser.add_argument("--convergence", help="check for convergence",
                        action="store_true")
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-s", "--shuffle", help="shuffle input data",
                        action="store_true")
    return parser


def _load_xy(path, block_size, svmlight, n_features, store_sparse):
    """Load samples and labels from an SVMLight or a text file.

    For text files the last column is taken as the labels and every
    preceding column as a feature. ``n_features`` and ``store_sparse`` are
    only forwarded to the SVMLight loader.
    """
    if svmlight:
        x, y = ds.load_svmlight_file(path, block_size, n_features,
                                     store_sparse)
        return x, y

    data = ds.load_txt_file(path, block_size)
    n_cols = data.shape[1]
    y = data[:, n_cols - 1: n_cols]
    x = data[:, :n_cols - 1]
    return x, y


def main():
    """Command-line driver that fits a CascadeSVM and reports the results.

    Reads the training data (CSV-like text or SVMLight), optionally shuffles
    it, fits the estimator and collects a result row: kernel, arity, block
    size, gamma, C, iterations, convergence flag, read time and fit time.
    When the training path is a directory the number of entries in it is
    appended, and when a test file is given the score on it is appended as
    well. The row is appended to the CSV output file if one was requested,
    otherwise it is printed.

    Without ``--detailed_times`` the read time is reported as 0 and the fit
    time covers everything from the end of argument parsing onwards.
    """
    args = _build_parser().parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    # A missing (or zero) gamma falls back to the estimator's "auto" mode.
    gamma = args.gamma if args.gamma else "auto"

    store_sparse = not args.dense

    rows, cols = args.block_size.split(",")[:2]
    block_size = (int(rows), int(cols))

    x, y = _load_xy(train_data, block_size, args.svmlight, args.features,
                    store_sparse)

    if args.shuffle:
        x, y = shuffle(x, y)

    if args.detailed_times:
        # Wait for the loading tasks so reading is timed apart from fitting.
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # The requested kernel is forwarded so that the model matches the kernel
    # reported in the output row.
    csvm = CascadeSVM(cascade_arity=args.arity, max_iter=args.iteration,
                      kernel=args.kernel, c=args.c, gamma=gamma,
                      check_convergence=args.convergence, verbose=args.verbose)

    csvm.fit(x, y)

    barrier()
    fit_time = time.time() - s_time

    out = [args.kernel, args.arity, args.block_size,
           csvm._clf_params["gamma"], args.c, csvm.iterations, csvm.converged,
           read_time, fit_time]

    if os.path.isdir(train_data):
        out.append(len(os.listdir(train_data)))

    if args.test_file:
        x_test, y_test = _load_xy(args.test_file, block_size, args.svmlight,
                                  args.features, store_sparse)
        out.append(compss_wait_on(csvm.score(x_test, y_test)))

    if args.output_file:
        # csv.writer needs a text-mode file; newline="" lets the csv module
        # control line endings itself.
        with open(args.output_file, "a", newline="") as f:
            csv.writer(f).writerow(out)
    else:
        print(out)