def test_csv_load_from_source_permute_split(monkeypatch, filename, filename_merge):
    def mock_S3Key_get(key):
        return filename_merge

    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'permute', 'columns': [3]},
        {'name': 'split', 'start': 0, 'end': 80}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (6, 9)
    assert dataset.output is None

    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'permute', 'columns': [0]},
        {'name': 'split', 'start': 60, 'end': 100}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (3, 8)
    assert dataset.output is None

    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': [2, 3]},
        {'name': 'permute', 'columns': [0]},
        {'name': 'split', 'start': 0, 'end': 70}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (4, 6)
    assert dataset.output.shape == (4, 2)

def test_csv_load_from_lines_text_text_data():
    lines = "key,0.6398986743,60,C\nvalue,0.3999189187,6,A\niter,M,74,B\nnames,0.3242075364,15,A\n" + \
            "any,0.9294471992,28,C\nother,0.503215279,90,A\nvalue,0.6827588778,96,A\nkeyword,0.5677838973,28,C\n" + \
            "new,0.131201321,53,B\nnames,0.5928324999,10,B\noverwrite,0.7019303145,38,C\n" + \
            "field,0.7117090842,71,C\ndefined,0.6661604231,60,A"
    kwargs = {
        'num_columns': 4,
        'dtypes': ['S', 'f', 'i', 'S'],
        'classes': [
            ['any', 'defined', 'field', 'iter', 'key', 'keyword', 'names', 'new',
             'other', 'overwrite', 'value'],
            [],
            [],
            ['A', 'B', 'C']
        ]
    }
    dataset = data_csv.GeneralDataset()
    dataset.load_from_lines(lines, with_output=False, **kwargs)
    # The 'iter,M,74,B' line has an invalid float value and is dropped,
    # leaving 12 of the 13 input rows.
    result = np.array([
        [ 4, 0.63989867, 60, 2],
        [10, 0.39991892,  6, 0],
        [ 6, 0.32420754, 15, 0],
        [ 0, 0.9294472,  28, 2],
        [ 8, 0.50321528, 90, 0],
        [10, 0.68275888, 96, 0],
        [ 5, 0.5677839,  28, 2],
        [ 7, 0.13120132, 53, 1],
        [ 6, 0.5928325,  10, 1],
        [ 9, 0.70193031, 38, 2],
        [ 2, 0.71170908, 71, 2],
        [ 1, 0.66616042, 60, 0]
    ])
    assert np.allclose(dataset.data, result)

def test_csv_load_from_source_merge(monkeypatch, filename, filename_merge):
    def mock_S3Key_get(key):
        return filename_merge
    monkeypatch.setattr(S3Key, 'get', mock_S3Key_get)

    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'merge', 'datas': [filename_merge]}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (14, 4)

def test_csv_load_from_source_merge_with_output(monkeypatch, filename, filename_merge):
    def mock_S3Key_get(key):
        return filename_merge
    monkeypatch.setattr(S3Key, 'get', mock_S3Key_get)

    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['2', '3']},
        {'name': 'merge', 'datas': [filename_merge]}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (14, 2)
    assert dataset.output.shape == (14, 2)

def test_csv_load_from_source_ignore_permute(filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [{'name': 'ignore', 'columns': [1, 2]},
                         {'name': 'permute', 'columns': [3]}]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [ 0.40900001, 0., 0., 0., 1., 0., 0.],
        [ 0.917,      1., 0., 0., 0., 0., 0.],
        [ 0.27599999, 0., 1., 0., 0., 0., 0.],
        [ 0.27399999, 0., 0., 0., 0., 0., 1.],
        [22.,         1., 0., 0., 0., 0., 0.],
        [51.,         1., 0., 0., 0., 0., 0.],
        [69.,         0., 0., 0., 0., 0., 0.],
        [ 1.,         1., 0., 0., 0., 0., 0.]
    ])
    assert np.allclose(dataset.data, result)

def test_csv_load_from_source_split(monkeypatch, filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'split', 'start': 0, 'end': 80}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (6, 4)
    assert dataset.output is None

    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'split', 'start': 80, 'end': 100}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (2, 4)

def test_csv_load_from_source_permute(monkeypatch, filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'permute', 'columns': ['3']},
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [4.09000000e-01, 7.96000000e-01, 6.90000000e-01, 0., 0., 0., 1., 0., 0.],
        [9.17000000e-01, 9.11000000e-01, 5.32000000e-01, 1., 0., 0., 0., 0., 0.],
        [2.76000000e-01, 7.57000000e-01, 9.27000000e-01, 0., 1., 0., 0., 0., 0.],
        [2.74000000e-01, 5.50000000e-02, 6.64000000e-01, 0., 0., 0., 0., 0., 1.],
        [2.20000000e+01, 7.00000000e+00, 0.00000000e+00, 1., 0., 0., 0., 0., 0.],
        [5.10000000e+01, 4.80000000e+01, 3.80000000e+01, 1., 0., 0., 0., 0., 0.],
        [6.90000000e+01, 4.00000000e+01, 1.60000000e+01, 0., 0., 0., 0., 0., 0.],
        [1.00000000e+00, 2.80000000e+01, 9.80000000e+01, 1., 0., 0., 0., 0., 0.]
    ])
    assert np.allclose(dataset.data, result)

def test_csv_load_from_source_normalize(filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'normalize'},
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [1.96432206e-03, 1.54552087e-02, 7.04081636e-03, 6.66666687e-01],
        [9.35599301e-03, 1.78537909e-02, 5.42857125e-03, 1.66666672e-01],
        [2.91010674e-05, 1.46417767e-02, 9.45918355e-03, 3.33333343e-01],
        [0.00000000e+00, 0.00000000e+00, 6.77551003e-03, 1.00000000e+00],
        [3.16124916e-01, 1.44853473e-01, 0.00000000e+00, 1.66666672e-01],
        [7.38090396e-01, 1.00000000e+00, 3.87755096e-01, 1.66666672e-01],
        [1.00000000e+00, 8.33142161e-01, 1.63265303e-01, 0.00000000e+00],
        [1.05636874e-02, 5.82855344e-01, 1.00000000e+00, 1.66666672e-01]
    ])
    assert np.allclose(dataset.data, result)

def test_csv_load_from_source_shuffle(filename):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'shuffle'},
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    result = np.array([
        [2.20000000e+01, 7.00000000e+00, 0.00000000e+00, 1.00000000e+00],
        [2.73999989e-01, 5.49999997e-02, 6.63999975e-01, 6.00000000e+00],
        [1.00000000e+00, 2.80000000e+01, 9.80000000e+01, 1.00000000e+00],
        [6.90000000e+01, 4.00000000e+01, 1.60000000e+01, 0.00000000e+00],
        [4.09000009e-01, 7.96000004e-01, 6.89999998e-01, 4.00000000e+00],
        [5.10000000e+01, 4.80000000e+01, 3.80000000e+01, 1.00000000e+00],
        [2.75999993e-01, 7.57000029e-01, 9.26999986e-01, 2.00000000e+00],
        [9.16999996e-01, 9.11000013e-01, 5.32000005e-01, 1.00000000e+00]
    ])
    assert np.allclose(dataset.data, result)

def test_csv_load_from_source_text_data(filename_text):
    dataset = data_csv.GeneralDataset()
    params = dataset_text_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': [4]},
    ]
    dataset_file = dataset.load_from_source(filename_text, **params)
    dataset.load(dataset_file=dataset_file)
    # NB: LINES_TEXT[5, 1] is missing. As all text data columns are permuted,
    # it is substituted with all zeros in order not to affect training.
    result = np.array([
        [2.00000000e+03, 0., 0., 0., 1., 0., 1.46851808e-01, 4.],
        [3.40000000e+03, 0., 0., 1., 0., 0., 8.65561426e-01, 1.],
        [1.20000000e+02, 0., 0., 0., 0., 1., 1.14752598e-01, 2.],
        [8.00000000e+02, 1., 0., 0., 0., 0., 2.78684437e-01, 6.],
        [2.10000000e+03, 0., 1., 0., 0., 0., 8.94018233e-01, 1.],
        [2.20000000e+03, 0., 0., 0., 0., 0., 7.81868279e-01, 1.],
        [2.00000000e+02, 0., 0., 1., 0., 0., 4.23042387e-01, 0.],
        [1.80000000e+04, 0., 1., 0., 0., 0., 8.16439629e-01, 1.]
    ])
    assert np.allclose(dataset.data, result)

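# A minimal sketch of the behaviour the NB above relies on: a text value that is
# absent from the known classes is encoded as an all-zero row, so it cannot affect
# training. `_one_hot_with_missing` is a hypothetical helper used only for
# illustration here; it is not part of the data_csv API.
def _one_hot_with_missing(values, classes):
    encoded = np.zeros((len(values), len(classes)))
    index = {cls: i for i, cls in enumerate(classes)}
    for row, value in enumerate(values):
        # unknown or missing values simply leave the row as zeros
        if value in index:
            encoded[row, index[value]] = 1.
    return encoded
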
def test_csv_load_from_source_balance(monkeypatch, filename, filename_merge):
    dataset = data_csv.GeneralDataset()
    params = dataset_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': []},
        {'name': 'balance', 'sample': 'undersampling'}
    ]
    dataset_file = dataset.load_from_source(filename, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (8, 4)

    # FAILS!
    # The output has shape (6, 3), while it has to be (3, 3).
    # There are 3 classes in the output: 0, 1 and 9. The 'undersampling' strategy
    # implies reducing the dataset so that every class is represented by the same
    # number of examples as the least populated class. Here that is class 9,
    # represented by only 1 example, so the result dataset should contain 3 rows.
    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['3']},
        {'name': 'balance', 'sample': 'undersampling'}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (3, 3)
    assert np.all(np.sort(dataset.output.T[0]) == np.array([0, 1, 9]))

    # FAILS!
    # The result of the 'oversampling' strategy with no split filter should consist
    # of all the examples from the original dataset plus additional examples,
    # randomly picked from the original dataset, so that every class ends up with
    # as many examples as the most populated class. In filename_merge that class
    # is 1, but the output consists of examples of ONLY class 1:
    #
    #   [ 2. 29. 30.]  [1]
    #   [ 1.  7. 89.]  [1]
    #   [ 1.  7. 89.]  [1]
    #   [ 4. 11.  8.]  [1]
    #   [ 4. 11.  8.]  [1]
    #
    # It should instead look like this (the order does not have to match):
    #
    #   [ 2. 29. 30.]  [1]
    #   [ 1.  7. 89.]  [1]
    #   [ 4. 11.  8.]  [1]
    #   [ 3. 92.  1.]  [0]
    #   [ 5. 50. 46.]  [0]
    #   [ 3. 92.  1.]  [0]
    #   [ 0. 10. 92.]  [9]
    #   [ 0. 10. 92.]  [9]
    #   [ 0. 10. 92.]  [9]
    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['3']},
        {'name': 'balance', 'sample': 'oversampling'}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    output = np.array([0, 0, 0, 1, 1, 1, 9, 9, 9])
    assert dataset.data.shape == (9, 3)
    assert np.all(np.sort(dataset.output.T[0]) == output)

    # FAILS!
    # The output has shape (5, 3), while it has to be (9, 3).
    # Same as above: it consists only of examples of the most populated class 1.
    # With 'uniform' balancing it is impossible to predict exactly which examples
    # will be chosen for the result dataset, but all classes should still be
    # represented.
    params = dataset_merge_params.copy()
    params['filters'] = [
        {'name': 'outputs', 'columns': ['3']},
        {'name': 'balance', 'sample': 'uniform'}
    ]
    dataset_file = dataset.load_from_source(filename_merge, **params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (9, 3)

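# A minimal sketch of the balancing behaviour described in the comments above,
# not the library's balance filter: 'undersampling' trims every class down to the
# size of the rarest class, 'oversampling' resamples every class up to the size of
# the most frequent one. `_reference_balance` is a hypothetical helper for
# illustration only.
def _reference_balance(data, output, sample='undersampling'):
    labels = output.ravel()
    classes, counts = np.unique(labels, return_counts=True)
    target = counts.min() if sample == 'undersampling' else counts.max()
    picked = []
    for cls in classes:
        indices = np.where(labels == cls)[0]
        # sample with replacement only when the class has to grow
        picked.append(np.random.choice(indices, target, replace=len(indices) < target))
    picked = np.concatenate(picked)
    return data[picked], output[picked]
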
def test_csv_load_from_lines_invalid():
    lines = "x1,x2,x3,y\n30,37,67,50\n93,error,55,24\n90,68,72,59\n23,35,76,8"
    kwargs = {'num_columns': 4, 'dtypes': ['i', 'i', 'i', 'i']}
    dataset = data_csv.GeneralDataset()
    dataset.load_from_lines(lines, **kwargs)
    assert dataset.data.shape == (3, 4)

def test_csv_load_from_source(filename):
    dataset = data_csv.GeneralDataset()
    dataset_file = dataset.load_from_source(filename, **dataset_params)
    dataset.load(dataset_file=dataset_file)
    assert dataset.data.shape == (8, 4)

def test_csv_load_from_lines():
    dataset = data_csv.GeneralDataset()
    dataset.load_from_lines(LINES)
    assert dataset.data.shape == (8, 4)