import numpy as np
import psutil
import pytest

# read_csv_data, InvalidInstanceError, and the fixtures/constants used below
# (LARGE_DATA_4MB, BUFFER_DATA, csv1, csv2, csv_data_dir, ...) are assumed to
# be imported from the package under test and its conftest; those imports are
# not part of this excerpt.


def test_read_csv_data_split_object():
    X, y = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="O")
    yX = read_csv_data(LARGE_DATA_4MB, output_dtype="O")
    assert X.shape == (38223, 20)
    assert y.shape == (38223,)
    assert np.array_equal(np.hstack((y.reshape(-1, 1), X)), yX)
    assert X.dtype.kind == "O"
    assert y.dtype.kind == "O"
def test_excel_dialect(csv_data_dir):
    """Test that the read_csv_data function properly reads files in the excel dialect."""
    generated_contents = read_csv_data(source=csv_data_dir + "/file_1.csv")
    assert generated_contents.shape == (len(csv1), len(csv1[0]))
    # dtype=str replaces the np.str alias, which was removed in NumPy 1.24
    assert np.all(
        generated_contents == np.array([[str(v) for v in row] for row in csv1], dtype=str)
    )
def test_directory_content(csv_data_dir):
    """Test that the read_csv_data function reads content correctly from a directory."""
    generated_contents = read_csv_data(source=csv_data_dir)
    correct_array = csv1 + csv2
    assert generated_contents.shape == (len(correct_array), len(correct_array[0]))
    assert np.all(
        generated_contents == np.array([[str(v) for v in row] for row in correct_array], dtype=str)
    )
# `data_file` and `shape` are expected to be supplied via pytest
# parametrization; the decorator is not shown in this excerpt.
def test_read_csv_data(data_file, shape):
    """Test for reading individual csv data files."""
    array = read_csv_data(source=data_file, batch_size=1, fit_memory_percent=100.0, output_dtype="U")
    assert array.shape == shape
    assert array.dtype.kind in {"U", "S"}
def test_read_csv_data_inmemory_mode():
    """Test to make sure 'InMemory' mode reads in content correctly."""
    generated_contents = read_csv_data(source=BUFFER_DATA.encode())
    correct_array = []
    for i in range(8):
        correct_array.append([i * 4 + j for j in range(1, 5)])
    assert generated_contents.shape == (len(correct_array), len(correct_array[0]))
    assert np.all(
        generated_contents == np.array([[str(v) for v in row] for row in correct_array], dtype=str)
    )
def test_read_csv_data_split_limited():
    """Test splitting X and y when only ~2MB of the 4MB dataset fits in memory."""
    total_memory_in_bytes = psutil.virtual_memory().total
    two_mb_in_bytes = _convert_megabytes_to_bytes(2)
    fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes
    X, y = read_csv_data(
        LARGE_DATA_4MB,
        target_column_index=0,
        fit_memory_percent=fraction_of_memory_to_use * 100,
        output_dtype="U",
    )
    assert _convert_megabytes_to_bytes(1.9) < (X.nbytes + y.nbytes) <= two_mb_in_bytes
    assert X.dtype.kind == "U"
    assert y.dtype.kind == "U"
def test_read_csv_data_samples():
    """Test for the sample case where the entire dataset doesn't fit into the available memory."""
    total_memory_in_bytes = psutil.virtual_memory().total
    two_mb_in_bytes = _convert_megabytes_to_bytes(2)
    fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes
    sample_data = read_csv_data(
        source=LARGE_DATA_4MB,
        fit_memory_percent=fraction_of_memory_to_use * 100,
        output_dtype="U",
    )
    assert sample_data.dtype.kind == "U"
    assert _convert_megabytes_to_bytes(1.9) < sample_data.nbytes <= two_mb_in_bytes
def test_read_csv_data_split_limited_object():
    """Same memory-limited X/y split as above, but with object output dtype."""
    total_memory_in_bytes = psutil.virtual_memory().total
    two_mb_in_bytes = _convert_megabytes_to_bytes(2)
    fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes
    X, y = read_csv_data(
        LARGE_DATA_4MB,
        target_column_index=0,
        fit_memory_percent=fraction_of_memory_to_use * 100,
        output_dtype="O",
    )
    # object arrays only store pointers in nbytes, so measure the referenced objects too
    arrays_memory = _get_size_total(X) + _get_size_total(y)
    assert _convert_megabytes_to_bytes(1.9) < arrays_memory <= two_mb_in_bytes
    assert X.dtype.kind == "O"
    assert y.dtype.kind == "O"
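# The two memory helpers used by the tests above are not defined in this
# excerpt. The sketches below show one plausible implementation, assuming
# _convert_megabytes_to_bytes does a plain MB-to-byte conversion and
# _get_size_total approximates the footprint of an object-dtype array by
# adding the sizes of the Python objects it references to the pointer buffer
# (nbytes). They are illustrative stand-ins, not the package's actual helpers.
import sys


def _convert_megabytes_to_bytes(megabytes):
    """Convert a (possibly fractional) number of megabytes to bytes."""
    return int(megabytes * 1024 * 1024)


def _get_size_total(array):
    """Approximate the total memory held by an object-dtype array."""
    return array.nbytes + sum(sys.getsizeof(item) for item in array.ravel())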
def test_read_csv_data_sample_append():
    """Test for reading data in chunks."""
    array = read_csv_data(source=LARGE_DATA_4MB, fit_memory_percent=100.0)
    assert array.shape == (38223, 21)
def test_read_csv_data_directory():
    """Test for reading from a directory of data"""
    array = read_csv_data(source="test/data/csv/mock_datasplitter_output", fit_memory_percent=100.0)
    assert array.shape == (22, 4)
def test_read_csv_data_invalid_csv():
    """Test that reading an invalid csv file raises InvalidInstanceError."""
    with pytest.raises(InvalidInstanceError):
        read_csv_data(source="test/data/csv/invalid.csv")
def test_read_empty_buffer():
    """Test for getting an empty array if the buffer is empty"""
    generated_contents = read_csv_data(source="".encode())
    assert generated_contents.size == 0
# Fragment of the training entry point; `args`, `header`, `processor`, and
# the helpers used below are defined earlier in the script.

# load feature processor from processor module
feature_transformer = processor.build_feature_transform()

# customize global feature transform step
feature_transformer = update_feature_transformer(header, feature_transformer)

# load label processor from processor module;
# absence of a label processor implies that the labels are not processed
try:
    label_transformer = processor.build_label_transform()
except AttributeError:
    label_transformer = None

X, y = read_csv_data(source=args.data_dir, target_column_index=header.target_column_index, output_dtype='O')
logging.info('Feature data shape: {}'.format(X.shape))

model = train(X, y, header=header, feature_transformer=feature_transformer, label_transformer=label_transformer)

# serialize the model to model_dir
dump(model, filename=os.path.join(args.model_dir, 'model.joblib'))

# serialize the inference code to model_dir/code
serialize_code(os.path.join(args.model_dir, 'code'), processor.__file__)
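# For context, a minimal sketch of the argument parsing that could supply the
# args.data_dir and args.model_dir used above. The flag names are assumptions;
# the environment-variable defaults follow the standard SageMaker training
# convention (SM_CHANNEL_TRAIN, SM_MODEL_DIR). The real entry point may differ.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
args, _ = parser.parse_known_args()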