Example #1
def test_csv_load_from_source_permute_split(monkeypatch, filename, filename_merge):
    def mock_S3Key_get(key):
        return filename_merge

    dataset = data_csv.GeneralDataset()

    params = dataset_params.copy()
    params['filters'] = [ {'name': 'permute', 'columns': [3]},
                          {'name': 'split',   'start': 0, 'end': 80} ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (6, 9)
    assert dataset.output is None

    params = dataset_merge_params.copy()
    params['filters'] = [ {'name': 'permute', 'columns': [0]},
                          {'name': 'split',   'start': 60, 'end': 100} ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (3, 8)
    assert dataset.output is None

    params = dataset_merge_params.copy()
    params['filters'] = [ {'name': 'outputs', 'columns': [2, 3]},
                          {'name': 'permute', 'columns': [0]},
                          {'name': 'split',   'start': 0, 'end': 70} ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (4, 6)
    assert dataset.output.shape == (4, 2)
Example #2
def test_csv_load_from_lines_text_text_data():
    lines = "key,0.6398986743,60,C\nvalue,0.3999189187,6,A\niter,M,74,B\nnames,0.3242075364,15,A\n"+ \
      "any,0.9294471992,28,C\nother,0.503215279,90,A\nvalue,0.6827588778,96,A\nkeyword,0.5677838973,28,C\n" + \
      "new,0.131201321,53,B\nnames,0.5928324999,10,B\noverwrite,0.7019303145,38,C\n" + \
      "field,0.7117090842,71,C\ndefined,0.6661604231,60,A"
    kwargs = {
        'num_columns': 4,
        'dtypes': ['S', 'f', 'i', 'S'],
        'classes': [
            ['any', 'defined', 'field', 'iter', 'key', 'keyword', 'names', 'new', 'other', 'overwrite', 'value'],
            [], [], ['A', 'B', 'C']
        ]
    }
    dataset = data_csv.GeneralDataset()
    dataset.load_from_lines(lines, with_output=False, **kwargs)
    result = np.array([
        [ 4, 0.63989867, 60, 2],
        [10, 0.39991892,  6, 0],
        [ 6, 0.32420754, 15, 0],
        [ 0, 0.9294472,  28, 2],
        [ 8, 0.50321528, 90, 0],
        [10, 0.68275888, 96, 0],
        [ 5, 0.5677839,  28, 2],
        [ 7, 0.13120132, 53, 1],
        [ 6, 0.5928325,  10, 1],
        [ 9, 0.70193031, 38, 2],
        [ 2, 0.71170908, 71, 2],
        [ 1, 0.66616042, 60, 0]
    ])
    assert np.allclose(dataset.data, result)
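The expected matrix above suggests how the string columns are encoded: each value appears to be replaced by its index in the corresponding classes list (an assumption inferred from the asserted values, not from documented behaviour). A minimal check of that mapping:

classes_text = ['any', 'defined', 'field', 'iter', 'key', 'keyword',
                'names', 'new', 'other', 'overwrite', 'value']
classes_label = ['A', 'B', 'C']

# 'key' in the first input row becomes 4 and its 'C' label becomes 2,
# matching the first row of the expected result.
assert classes_text.index('key') == 4
assert classes_label.index('C') == 2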
Example #3
def test_csv_load_from_source_merge(monkeypatch, filename, filename_merge):
    def mock_S3Key_get(key):
        return filename_merge

    monkeypatch.setattr(S3Key, 'get', mock_S3Key_get)
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'merge', 'datas': [filename_merge]}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (14, 4)
Example #4
def test_csv_load_from_source_merge_with_output(monkeypatch, filename, filename_merge):
    def mock_S3Key_get(key):
        return filename_merge

    monkeypatch.setattr(S3Key, 'get', mock_S3Key_get)
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['2', '3']},
        {'name': 'merge', 'datas': [filename_merge]}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (14, 2)
    assert dataset.output.shape == (14, 2)
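A minimal numpy sketch of what the 'outputs' filter appears to do with the merged 14-row table: the listed columns are moved into the output array and the remaining columns stay in the data array. The column handling below is an assumption inferred from the asserted shapes, not the library's implementation.

import numpy as np

table = np.arange(14 * 4, dtype=float).reshape(14, 4)  # 8 + 6 merged rows, 4 columns
output_cols = [2, 3]
data_cols = [c for c in range(table.shape[1]) if c not in output_cols]
data, output = table[:, data_cols], table[:, output_cols]
assert data.shape == (14, 2) and output.shape == (14, 2)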
Example #5
def test_csv_load_from_source_ignore_permute(filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [{'name': 'ignore',  'columns': [1, 2]},
                         {'name': 'permute', 'columns': [3]}]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [ 0.40900001, 0., 0., 0., 1., 0., 0. ],
        [ 0.917,      1., 0., 0., 0., 0., 0. ],
        [ 0.27599999, 0., 1., 0., 0., 0., 0. ],
        [ 0.27399999, 0., 0., 0., 0., 0., 1. ],
        [ 22.,        1., 0., 0., 0., 0., 0. ],
        [ 51.,        1., 0., 0., 0., 0., 0. ],
        [ 69.,        0., 0., 0., 0., 0., 0. ],
        [  1.,        1., 0., 0., 0., 0., 0. ]
    ])
    assert np.allclose(dataset.data, result)
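The matrix above hints at how the 'permute' filter expands the chosen column: it looks like a one-hot style encoding in which category k sets the k-th indicator column and category 0 is left as all zeros (note the all-zero row). The sketch below is a guessed reconstruction from the asserted values, not the library's actual code.

import numpy as np

def expand_category(values, width):
    onehot = np.zeros((len(values), width))
    for i, v in enumerate(values):
        if v > 0:
            onehot[i, v - 1] = 1.0  # category 0 stays all zeros
    return onehot

# Last column of the source file (see the normalize and shuffle tests): 4,1,2,6,1,1,0,1
encoded = expand_category([4, 1, 2, 6, 1, 1, 0, 1], 6)
assert encoded[0].tolist() == [0., 0., 0., 1., 0., 0.]  # category 4
assert encoded[6].tolist() == [0., 0., 0., 0., 0., 0.]  # category 0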
Example #6
def test_csv_load_from_source_split(monkeypatch, filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'split', 'start': 0, 'end': 80}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (6, 4)
    assert dataset.output is None

    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'split', 'start': 80, 'end': 100}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (2, 4)
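A minimal sketch of the split semantics these assertions imply: start and end look like percentages of the row count, so 0-80 of the 8-row file keeps 6 rows and 80-100 keeps the remaining 2. The rounding below is an assumption; the real filter may compute the boundaries differently.

import numpy as np

def split_rows(data, start, end):
    n = data.shape[0]
    return data[int(n * start / 100):int(n * end / 100)]

rows = np.arange(8 * 4, dtype=float).reshape(8, 4)
assert split_rows(rows, 0, 80).shape == (6, 4)
assert split_rows(rows, 80, 100).shape == (2, 4)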
Example #7
def test_csv_load_from_source_permute(monkeypatch, filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'permute', 'columns': ['3']},
        ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [4.09000000e-01, 7.96000000e-01, 6.90000000e-01, 0., 0., 0., 1., 0., 0.],
        [9.17000000e-01, 9.11000000e-01, 5.32000000e-01, 1., 0., 0., 0., 0., 0.],
        [2.76000000e-01, 7.57000000e-01, 9.27000000e-01, 0., 1., 0., 0., 0., 0.],
        [2.74000000e-01, 5.50000000e-02, 6.64000000e-01, 0., 0., 0., 0., 0., 1.],
        [2.20000000e+01, 7.00000000e+00, 0.00000000e+00, 1., 0., 0., 0., 0., 0.],
        [5.10000000e+01, 4.80000000e+01, 3.80000000e+01, 1., 0., 0., 0., 0., 0.],
        [6.90000000e+01, 4.00000000e+01, 1.60000000e+01, 0., 0., 0., 0., 0., 0.],
        [1.00000000e+00, 2.80000000e+01, 9.80000000e+01, 1., 0., 0., 0., 0., 0.]
    ])
    assert np.allclose(dataset.data, result)
Example #8
def test_csv_load_from_source_normalize(filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'normalize'},
        ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [  1.96432206e-03,  1.54552087e-02,  7.04081636e-03,  6.66666687e-01],
        [  9.35599301e-03,  1.78537909e-02,  5.42857125e-03,  1.66666672e-01],
        [  2.91010674e-05,  1.46417767e-02,  9.45918355e-03,  3.33333343e-01],
        [  0.00000000e+00,  0.00000000e+00,  6.77551003e-03,  1.00000000e+00],
        [  3.16124916e-01,  1.44853473e-01,  0.00000000e+00,  1.66666672e-01],
        [  7.38090396e-01,  1.00000000e+00,  3.87755096e-01,  1.66666672e-01],
        [  1.00000000e+00,  8.33142161e-01,  1.63265303e-01,  0.00000000e+00],
        [  1.05636874e-02,  5.82855344e-01,  1.00000000e+00,  1.66666672e-01]
    ])
    assert np.allclose(dataset.data, result)
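The values above are consistent with independent per-column min-max scaling. A small sketch under that assumption, using the raw values visible in the permute and shuffle tests:

import numpy as np

raw = np.array([
    [0.409, 0.796, 0.690, 4.],
    [0.917, 0.911, 0.532, 1.],
    [0.276, 0.757, 0.927, 2.],
    [0.274, 0.055, 0.664, 6.],
    [22.,   7.,    0.,    1.],
    [51.,   48.,   38.,   1.],
    [69.,   40.,   16.,   0.],
    [1.,    28.,   98.,   1.],
])
lo, hi = raw.min(axis=0), raw.max(axis=0)
normalized = (raw - lo) / (hi - lo)  # each column rescaled to [0, 1]
assert np.isclose(normalized[0, 0], 1.96432206e-03, rtol=1e-4)
assert np.isclose(normalized[0, 3], 6.66666687e-01, rtol=1e-4)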
Example #9
def test_csv_load_from_source_shuffle(filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'shuffle'},
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [  2.20000000e+01,  7.00000000e+00,  0.00000000e+00,  1.00000000e+00],
        [  2.73999989e-01,  5.49999997e-02,  6.63999975e-01,  6.00000000e+00],
        [  1.00000000e+00,  2.80000000e+01,  9.80000000e+01,  1.00000000e+00],
        [  6.90000000e+01,  4.00000000e+01,  1.60000000e+01,  0.00000000e+00],
        [  4.09000009e-01,  7.96000004e-01,  6.89999998e-01,  4.00000000e+00],
        [  5.10000000e+01,  4.80000000e+01,  3.80000000e+01,  1.00000000e+00],
        [  2.75999993e-01,  7.57000029e-01,  9.26999986e-01,  2.00000000e+00],
        [  9.16999996e-01,  9.11000013e-01,  5.32000005e-01,  1.00000000e+00]
    ])
    assert np.allclose(dataset.data, result)
Example #10
def test_csv_load_from_source_text_data(filename_text):
    dataset = data_csv.GeneralDataset()
    params = dataset_text_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': [4]},
    ]
    dataset_file = dataset.load_from_source(filename_text, **params)
    dataset.load(dataset_file=dataset_file)
    # NB: LINES_TEXT[5, 1] is missing. Since all text data columns are permuted,
    # the missing value is substituted with all zeros so it does not affect training.
    result = np.array([
        [  2.00000000e+03,  0.,  0.,  0.,  1.,  0.,  1.46851808e-01,  4.],
        [  3.40000000e+03,  0.,  0.,  1.,  0.,  0.,  8.65561426e-01,  1.],
        [  1.20000000e+02,  0.,  0.,  0.,  0.,  1.,  1.14752598e-01,  2.],
        [  8.00000000e+02,  1.,  0.,  0.,  0.,  0.,  2.78684437e-01,  6.],
        [  2.10000000e+03,  0.,  1.,  0.,  0.,  0.,  8.94018233e-01,  1.],
        [  2.20000000e+03,  0.,  0.,  0.,  0.,  0.,  7.81868279e-01,  1.],
        [  2.00000000e+02,  0.,  0.,  1.,  0.,  0.,  4.23042387e-01,  0.],
        [  1.80000000e+04,  0.,  1.,  0.,  0.,  0.,  8.16439629e-01,  1.]
    ])
    assert np.allclose(dataset.data, result)
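A tiny illustration of the substitution described in the comment above (assumed behaviour, with a hypothetical helper): when a categorical value is absent, its indicator block is simply left as zeros instead of guessing a class.

import numpy as np

def encode_or_zero(value, classes):
    onehot = np.zeros(len(classes))
    if value in classes:
        onehot[classes.index(value)] = 1.0
    return onehot

# The text column has 5 known classes; a missing value yields 5 zeros (row 6 above).
assert encode_or_zero(None, ['a', 'b', 'c', 'd', 'e']).tolist() == [0., 0., 0., 0., 0.]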
Example #11
def test_csv_load_from_source_balance(monkeypatch, filename, filename_merge):
    dataset = data_csv.GeneralDataset()

    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'balance', 'sample': 'undersampling'}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (8, 4)

    # FAILS!
    # The output has shape (6, 3), while it should be (3, 3).
    # There are 3 classes in the output: 0, 1 and 9. The 'undersampling' strategy
    # implies reducing the dataset so that every class is represented by the same
    # number of examples as the least frequent class. In this case that is class 9,
    # which is represented by only 1 example, so the resulting dataset should
    # contain 3 rows.
    #
    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['3']},
        {'name': 'balance', 'sample': 'undersampling'}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (3, 3)
    assert np.all(np.sort(dataset.output.T[0]) == np.array([0, 1, 9]))

    # FAILS
    # With the 'oversampling' strategy and no split filter, the result should
    # consist of all the examples from the original dataset plus additional
    # examples, randomly picked from the original dataset, so that every class
    # ends up with as many examples as the most frequent class. In
    # filename_merge that class is 1, but the output consists of examples of
    # ONLY class 1:
    #
    # [  2.  29.  30.] [ 1 ]
    # [  1.   7.  89.] [ 1 ]
    # [  1.   7.  89.] [ 1 ]
    # [  4.  11.   8.] [ 1 ]
    # [  4.  11.   8.] [ 1 ]
    #
    # However, it should look like this (the rows do not have to be in the same order):
    #
    # [  2.  29.  30.] [ 1 ]
    # [  1.   7.  89.] [ 1 ]
    # [  4.  11.   8.] [ 1 ]
    # [  3.  92.   1.] [ 0 ]
    # [  5.  50.  46.] [ 0 ]
    # [  3.  92.   1.] [ 0 ]
    # [  0.  10.  92.] [ 9 ]
    # [  0.  10.  92.] [ 9 ]
    # [  0.  10.  92.] [ 9 ]
    #
    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['3']},
        {'name': 'balance', 'sample': 'oversampling'}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    output = np.array([0, 0, 0, 1, 1, 1, 9, 9, 9])
    assert dataset.data.shape == (9, 3)
    assert np.all(np.sort(dataset.output.T[0]) == output)

    # FAILS
    # The output has shape (5, 3), while it should be (9, 3).
    # Same as above: it consists only of examples of the most frequent class 1.
    # With 'uniform' balancing it is impossible to predict exactly which examples
    # will be chosen for the resulting dataset, but all classes should still be represented.
    #
    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['3']},
        {'name': 'balance', 'sample': 'uniform'}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (9, 3)
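The comments above spell out the intended balancing semantics. A minimal numpy sketch of those semantics (assumed, not the library's implementation), using the merge file's class counts of 2, 3 and 1 rows for classes 0, 1 and 9:

import numpy as np

def undersample(labels):
    classes, counts = np.unique(labels, return_counts=True)
    n = counts.min()
    return np.concatenate([np.where(labels == c)[0][:n] for c in classes])

def oversample(labels, seed=0):
    rng = np.random.default_rng(seed)
    classes, counts = np.unique(labels, return_counts=True)
    n = counts.max()
    return np.concatenate([rng.choice(np.where(labels == c)[0], n) for c in classes])

labels = np.array([1, 1, 1, 0, 0, 9])
assert len(undersample(labels)) == 3  # one row per class, matching the rarest class
assert len(oversample(labels)) == 9   # three rows per class, matching the most frequent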
Example #12
def test_csv_load_from_lines_invalid():
    lines = "x1,x2,x3,y\n30,37,67,50\n93,error,55,24\n90,68,72,59\n23,35,76,8"
    kwargs = {'num_columns': 4, 'dtypes': ['i', 'i', 'i', 'i']}
    dataset = data_csv.GeneralDataset()
    dataset.load_from_lines(lines, **kwargs)
    assert dataset.data.shape == (3, 4)
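A small sketch of the behaviour this test relies on (reconstructed from the assertion, not from the library's code): rows whose values cannot be parsed with the declared dtype are dropped, so the 'error' row is skipped and 3 of the 4 data rows remain.

rows = ["30,37,67,50", "93,error,55,24", "90,68,72,59", "23,35,76,8"]
parsed = []
for row in rows:
    try:
        parsed.append([int(v) for v in row.split(',')])
    except ValueError:
        continue  # skip rows that fail to parse as integers
assert len(parsed) == 3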
Example #13
def test_csv_load_from_source(filename):
    dataset = data_csv.GeneralDataset()
    dataset_file = dataset.load_from_source(filename, **dataset_params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (8, 4)
Example #14
def test_csv_load_from_lines():
    dataset = data_csv.GeneralDataset()
    dataset.load_from_lines(LINES)
    assert dataset.data.shape == (8, 4)