def test_structured_data_from_csv_partial_col_type_classifier(tmp_dir):
    """Smoke test: fit from CSV paths with a partial ``column_types`` mapping.

    There is no assertion; the test passes when construction and ``fit``
    complete without raising.
    """
    classifier = ak.StructuredDataClassifier(
        column_types=common.PARTIAL_COLUMN_TYPES_FROM_CSV,
        directory=tmp_dir,
        max_trials=1,
    )
    classifier.fit(
        x=common.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=common.TEST_FILE_PATH,
    )
def test_structured_data_get_col_names_from_df(fit, tmp_path):
    """Check that, after fitting, the first input's first column name is "sex"."""
    classifier = ak.StructuredDataClassifier(
        directory=tmp_path,
        seed=test_utils.SEED,
    )
    classifier.fit(x=test_utils.TRAIN_CSV_PATH, y="survived")

    # Flatten the (possibly nested) inputs and inspect the first one.
    first_input = nest.flatten(classifier.inputs)[0]
    assert first_input.column_names[0] == "sex"
def test_structured_data_from_csv_classifier(tmp_dir):
    """Fit from CSV paths and check predictions have one column per test row."""
    classifier = ak.StructuredDataClassifier(directory=tmp_dir, max_trials=1)
    classifier.fit(
        x=common.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=common.TEST_FILE_PATH,
    )

    x_test = common.csv_test('classification')
    predictions = classifier.predict(x_test)
    assert predictions.shape == (len(x_test), 1)
def test_raise_error_unknown_str_in_col_type(tmp_path):
    """An unrecognised column type string ('num') must raise ``ValueError``."""
    bad_column_types = {'age': 'num', 'parch': 'categorical'}

    with pytest.raises(ValueError) as exc_info:
        ak.StructuredDataClassifier(
            column_types=bad_column_types,
            directory=tmp_path,
            seed=utils.SEED,
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert 'Column_types should be either "categorical"' in message
def test_structured_clf_fit_call_auto_model_fit(fit, tmp_path):
    """Check that ``StructuredDataClassifier.fit`` invokes the patched ``fit``.

    Args:
        fit: fixture replacing the underlying fit call (presumably a
            ``unittest.mock`` patch of ``AutoModel.fit`` -- confirm in conftest).
        tmp_path: pytest-provided temporary directory used as the project dir.
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path, seed=utils.SEED)
    auto_model.fit(
        x=utils.generate_structured_data(num_instances=100),
        y=utils.generate_one_hot_labels(num_instances=100, num_classes=3),
    )
    # ``called`` is the real Mock flag.  ``is_called`` is not part of the Mock
    # API: a mock auto-creates it as a truthy child, so the old assertion
    # could never fail.
    assert fit.called
def test_structured_data_from_numpy_classifier(tmp_dir):
    """Smoke test: fit on in-memory data, validating on the training arrays.

    Labels are random integers drawn from {0, 1, 2}.  There is no assertion;
    the test passes when ``fit`` completes without raising.
    """
    sample_count = 500
    features = common.structured_data(sample_count)
    labels = np.random.randint(0, 3, sample_count)

    classifier = ak.StructuredDataClassifier(directory=tmp_dir, max_trials=1)
    classifier.fit(
        features,
        labels,
        epochs=2,
        validation_data=(features, labels),
    )
def test_structured_clf_predict_csv_call_automodel_predict(predict, fit, tmp_path):
    """Check that predicting from a CSV path invokes the patched ``predict``.

    Args:
        predict: fixture replacing the underlying predict call (presumably a
            ``unittest.mock`` patch -- confirm in conftest).
        fit: fixture replacing the underlying fit call.
        tmp_path: pytest-provided temporary directory used as the project dir.
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path, seed=utils.SEED)
    auto_model.fit(x=utils.TRAIN_CSV_PATH, y="survived")
    auto_model.predict(x=utils.TEST_CSV_PATH)
    # ``called`` is the real Mock flag.  ``is_called`` is auto-created by a
    # mock as a truthy child, so the old assertion could never fail.
    assert predict.called
def test_structured_clf_evaluate_call_automodel_evaluate(evaluate, fit, tmp_path):
    """Check that evaluating from a CSV path invokes the patched ``evaluate``.

    Args:
        evaluate: fixture replacing the underlying evaluate call (presumably a
            ``unittest.mock`` patch -- confirm in conftest).
        fit: fixture replacing the underlying fit call.
        tmp_path: pytest-provided temporary directory used as the project dir.
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path, seed=utils.SEED)
    auto_model.fit(x=utils.TRAIN_CSV_PATH, y="survived")
    auto_model.evaluate(x=utils.TRAIN_CSV_PATH, y="survived")
    # ``called`` is the real Mock flag.  ``is_called`` is auto-created by a
    # mock as a truthy child, so the old assertion could never fail.
    assert evaluate.called
def test_structured_data_classifier_transform_new_data(tmp_dir):
    """Smoke test: fit on the first 100 rows and validate on the remaining 100.

    There is no assertion; the test passes when ``fit`` completes without
    raising.
    """
    total_rows, train_rows = 200, 100

    features = common.structured_data(total_rows)
    labels = np.random.randint(0, 3, total_rows)

    train_features, held_out_features = features[:train_rows], features[train_rows:]
    train_labels, held_out_labels = labels[:train_rows], labels[train_rows:]

    classifier = ak.StructuredDataClassifier(directory=tmp_dir, max_trials=1)
    classifier.fit(
        train_features,
        train_labels,
        epochs=2,
        validation_data=(held_out_features, held_out_labels),
    )
def test_structured_data_from_csv_less_col_name_classifier(tmp_dir):
    """Building and fitting with ``LESS_COLUMN_NAMES_FROM_CSV`` must raise ``ValueError``."""
    with pytest.raises(ValueError) as exc_info:
        classifier = ak.StructuredDataClassifier(
            column_names=common.LESS_COLUMN_NAMES_FROM_CSV,
            directory=tmp_dir,
            max_trials=1,
        )
        classifier.fit(
            x=common.TRAIN_FILE_PATH,
            y='survived',
            epochs=2,
            validation_data=common.TEST_FILE_PATH,
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert 'Expect column_names to have length' in message
def test_structured_data_from_csv_false_col_type_classifier(tmp_dir):
    """Building and fitting with ``FALSE_COLUMN_TYPES_FROM_CSV`` must raise ``ValueError``."""
    with pytest.raises(ValueError) as exc_info:
        classifier = ak.StructuredDataClassifier(
            column_types=common.FALSE_COLUMN_TYPES_FROM_CSV,
            directory=tmp_dir,
            max_trials=1,
        )
        classifier.fit(
            x=common.TRAIN_FILE_PATH,
            y='survived',
            epochs=2,
            validation_data=common.TEST_FILE_PATH,
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert 'Column_types should be either "categorical"' in message
def build_pipeline(self):
    """Build the AutoKeras pipeline that matches ``self.problem_type``.

    Returns:
        ``ak.StructuredDataClassifier`` when ``self.problem_type`` is
        "classification", or ``ak.StructuredDataRegressor`` when it is
        "regression"; either is constructed with ``self.automl_settings``
        unpacked as keyword arguments.

    Raises:
        ValueError: if ``self.problem_type`` is neither "classification" nor
            "regression".
    """
    if self.problem_type == "classification":
        return ak.StructuredDataClassifier(**self.automl_settings)
    if self.problem_type == "regression":
        return ak.StructuredDataRegressor(**self.automl_settings)
    # Fail with a clear message instead of the UnboundLocalError the bare
    # if/elif chain produced for an unrecognised problem type.
    raise ValueError(
        "Unsupported problem_type {!r}; expected 'classification' or "
        "'regression'.".format(self.problem_type)
    )
def test_structured_data_from_csv_col_type_mismatch_classifier(tmp_dir):
    """Building and fitting with ``COLUMN_TYPES_FROM_CSV`` must raise a mismatch ``ValueError``."""
    with pytest.raises(ValueError) as exc_info:
        classifier = ak.StructuredDataClassifier(
            column_types=common.COLUMN_TYPES_FROM_CSV,
            directory=tmp_dir,
            max_trials=1,
        )
        classifier.fit(
            x=common.TRAIN_FILE_PATH,
            y='survived',
            epochs=2,
            validation_data=common.TEST_FILE_PATH,
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert 'Column_names and column_types are mismatched.' in message
def test_structured_data_from_numpy_classifier(tmp_dir):
    """Fit on the first 400 rows and check the prediction shape on the rest.

    Validation reuses the training split; the held-out rows are only used for
    ``predict``.
    """
    total_rows, train_rows = 500, 400

    features = common.structured_data(total_rows)
    labels = np.random.randint(0, 3, total_rows)

    train_features, held_out_features = features[:train_rows], features[train_rows:]
    train_labels, held_out_labels = labels[:train_rows], labels[train_rows:]

    classifier = ak.StructuredDataClassifier(directory=tmp_dir, max_trials=1)
    classifier.fit(
        train_features,
        train_labels,
        epochs=2,
        validation_data=(train_features, train_labels),
    )

    predictions = classifier.predict(held_out_features)
    assert predictions.shape == (len(held_out_labels), 1)
def test_structured_clf_fit_call_auto_model_fit(fit, tmp_path):
    """Check that ``StructuredDataClassifier.fit`` invokes the patched ``fit``.

    The features are the first 100 rows of the training CSV converted to a
    string-typed numpy array; the labels are generated one-hot vectors.

    Args:
        fit: fixture replacing the underlying fit call (presumably a
            ``unittest.mock`` patch of ``AutoModel.fit`` -- confirm in conftest).
        tmp_path: pytest-provided temporary directory used as the project dir.
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path, seed=utils.SEED)
    # ``np.unicode`` was only an alias of the builtin ``str`` and has been
    # removed from NumPy; ``str`` gives the same dtype on every version.
    features = pd.read_csv(utils.TRAIN_CSV_PATH).to_numpy().astype(str)[:100]
    auto_model.fit(
        x=features,
        y=utils.generate_one_hot_labels(num_instances=100, num_classes=3),
    )
    # ``called`` is the real Mock flag.  ``is_called`` is auto-created by a
    # mock as a truthy child, so the old assertion could never fail.
    assert fit.called
def test_structured_data_from_numpy_col_name_classifier(tmp_dir):
    """Smoke test: fit on in-memory data with explicit ``column_names``.

    There is no assertion; the test passes when ``fit`` completes without
    raising.
    """
    sample_count = 500
    features = common.structured_data(sample_count)
    labels = np.random.randint(0, 3, sample_count)

    classifier = ak.StructuredDataClassifier(
        column_names=common.COLUMN_NAMES_FROM_NUMPY,
        directory=tmp_dir,
        max_trials=1,
    )
    classifier.fit(
        features,
        labels,
        epochs=2,
        validation_data=(features, labels),
    )
def test_structured_data_clf_convert_csv_to_df_and_np(fit, tmp_path):
    """Check the keyword arguments forwarded to the patched ``fit``.

    After fitting from CSV paths, the first recorded call must carry ``x`` as
    a pandas DataFrame and ``y`` as a numpy array.
    """
    classifier = ak.StructuredDataClassifier(directory=tmp_path, seed=utils.SEED)
    classifier.fit(
        x=utils.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=(utils.TEST_FILE_PATH, 'survived'),
    )

    _positional, forwarded = fit.call_args_list[0]
    assert isinstance(forwarded['x'], pandas.DataFrame)
    assert isinstance(forwarded['y'], np.ndarray)
def test_titaninc_accuracy_over_77(tmp_path):
    """Fit on the downloaded Titanic CSVs and require accuracy of at least 0.77.

    Both files are fetched with ``tf.keras.utils.get_file``; the second element
    of ``evaluate``'s result is treated as the accuracy.
    """
    train_url = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
    eval_url = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

    train_csv = tf.keras.utils.get_file("train.csv", train_url)
    eval_csv = tf.keras.utils.get_file("eval.csv", eval_url)

    classifier = ak.StructuredDataClassifier(max_trials=10, directory=tmp_path)
    classifier.fit(train_csv, "survived")

    evaluation = classifier.evaluate(eval_csv, "survived")
    accuracy = evaluation[1]
    assert accuracy >= 0.77
def test_raise_error_unknown_str_in_col_type(tmp_path):
    """An unrecognised column type string ("num") must raise ``ValueError``."""
    bad_column_types = {"age": "num", "parch": "categorical"}

    with pytest.raises(ValueError) as exc_info:
        ak.StructuredDataClassifier(
            column_types=bad_column_types,
            directory=tmp_path,
            seed=test_utils.SEED,
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert 'column_types should be either "categorical"' in message
def test_structured_data_classifier_from_csv(init, fit):
    """Check that fitting from CSV paths calls ``init`` and forwards converted data.

    The first recorded ``fit`` call must carry ``x`` as a pandas DataFrame and
    ``y`` as a numpy array.
    """
    # NOTE(review): ``tmp_dir`` is not a parameter of this test, so it is
    # resolved from the enclosing/module scope -- confirm it is defined there.
    classifier = ak.StructuredDataClassifier(
        directory=tmp_dir,
        max_trials=1,
        seed=common.SEED,
    )
    classifier.fit(
        x=common.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=common.TEST_FILE_PATH,
    )

    assert init.called
    _positional, forwarded = fit.call_args_list[0]
    assert isinstance(forwarded['x'], pandas.DataFrame)
    assert isinstance(forwarded['y'], np.ndarray)
def main():
    """Train a structured-data classifier and report accuracy and fit time.

    Downloads the train/eval CSVs, times only the ``fit`` call, then prints
    the accuracy as a percentage and the elapsed seconds, both rounded to two
    decimals.
    """
    train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
    test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

    classifier = ak.StructuredDataClassifier(
        max_trials=10,
        directory='tmp_dir',
        overwrite=True,
    )

    fit_started = timeit.default_timer()
    classifier.fit(train_file_path, 'survived')
    fit_finished = timeit.default_timer()

    # The second element of the evaluation result is treated as the accuracy.
    evaluation = classifier.evaluate(test_file_path, 'survived')
    accuracy_percent = round(evaluation[1] * 100, 2)
    elapsed_seconds = round(fit_finished - fit_started, 2)

    print('Accuracy: {accuracy}%'.format(accuracy=accuracy_percent))
    print('Total time: {time} seconds.'.format(time=elapsed_seconds))
def test_structured_data_from_numpy_classifier(tmp_dir):
    """Fit with 3-class one-hot labels and check the prediction shape.

    Validation reuses the training split; the held-out rows are only used for
    ``predict``, which must return three columns per row.
    """
    total_rows, train_rows = 500, 400

    features = common.generate_structured_data(total_rows)
    train_features, held_out_features = features[:train_rows], features[train_rows:]

    labels = common.generate_one_hot_labels(num_instances=total_rows, num_classes=3)
    train_labels, held_out_labels = labels[:train_rows], labels[train_rows:]

    classifier = ak.StructuredDataClassifier(
        directory=tmp_dir,
        max_trials=1,
        seed=common.SEED,
    )
    classifier.fit(
        train_features,
        train_labels,
        epochs=2,
        validation_data=(train_features, train_labels),
    )

    predictions = classifier.predict(held_out_features)
    assert predictions.shape == (len(held_out_labels), 3)
def test_structured_data_col_type_no_name_error(tmp_path):
    """``column_types`` without ``column_names`` on array input must raise ``ValueError``."""
    typed_columns = {"age": "numerical", "parch": "categorical"}

    with pytest.raises(ValueError) as exc_info:
        classifier = ak.StructuredDataClassifier(
            column_types=typed_columns,
            directory=tmp_path,
            seed=test_utils.SEED,
        )
        classifier.fit(x=np.random.rand(100, 30), y=np.random.rand(100, 1))

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert "column_names must be specified" in message
def test_raise_error_unknown_name_in_col_type(tmp_path):
    """A ``column_types`` key missing from ``column_names`` must raise ``ValueError``."""
    typed_columns = {"age": "numerical", "parch": "categorical"}
    declared_names = ["age", "fare"]

    with pytest.raises(ValueError) as exc_info:
        ak.StructuredDataClassifier(
            column_types=typed_columns,
            column_names=declared_names,
            directory=tmp_path,
            seed=utils.SEED,
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert "Column_names and column_types are mismatched" in message
def test_structured_data_from_numpy_col_type_classifier(tmp_dir):
    """``column_types`` without ``column_names`` on in-memory data must raise ``ValueError``.

    The error message is compared for exact equality.
    """
    sample_count = 500
    features = common.structured_data(sample_count)
    labels = np.random.randint(0, 3, sample_count)

    with pytest.raises(ValueError) as exc_info:
        classifier = ak.StructuredDataClassifier(
            column_types=common.COLUMN_TYPES_FROM_NUMPY,
            directory=tmp_dir,
            max_trials=1,
        )
        classifier.fit(
            features,
            labels,
            epochs=2,
            validation_data=(features, labels),
        )

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert message == 'Column names must be specified.'
def test_structured_data_classifier(tmp_path):
    """End-to-end check: fit, export the model, and check the prediction shape.

    Features come from the training CSV converted to a string-typed numpy
    array (at most the first 500 rows); labels are generated 3-class one-hot
    vectors.  Validation reuses the training split.

    Args:
        tmp_path: pytest-provided temporary directory used as the project dir.
    """
    num_data = 500
    num_train = 400
    # ``np.unicode`` was only an alias of the builtin ``str`` and has been
    # removed from NumPy; ``str`` gives the same dtype on every version.
    data = pd.read_csv(utils.TRAIN_CSV_PATH).to_numpy().astype(str)[:num_data]
    x_train, x_test = data[:num_train], data[num_train:]
    y = utils.generate_one_hot_labels(num_instances=num_data, num_classes=3)
    y_train, y_test = y[:num_train], y[num_train:]
    clf = ak.StructuredDataClassifier(
        directory=tmp_path, max_trials=1, seed=utils.SEED
    )
    clf.fit(x_train, y_train, epochs=2, validation_data=(x_train, y_train))
    clf.export_model()
    assert clf.predict(x_test).shape == (len(y_test), 3)
def test_structured_classifier(init, fit):
    """Check that building and fitting the classifier calls ``init`` and ``fit``."""
    sample_count = 500
    features = common.generate_structured_data(sample_count)
    one_hot_labels = common.generate_one_hot_labels(
        num_instances=sample_count,
        num_classes=3,
    )

    # NOTE(review): ``tmp_dir`` is not a parameter of this test, so it is
    # resolved from the enclosing/module scope -- confirm it is defined there.
    classifier = ak.StructuredDataClassifier(
        column_names=common.COLUMN_NAMES_FROM_NUMPY,
        directory=tmp_dir,
        max_trials=1,
        seed=common.SEED,
    )
    classifier.fit(
        features,
        one_hot_labels,
        epochs=2,
        validation_data=(features, one_hot_labels),
    )

    assert init.called
    assert fit.called
def test_structured_data_input_name_type_mismatch_error(tmp_path):
    """``column_types`` keys absent from ``column_names`` must raise ``ValueError``."""
    typed_columns = {"_age": "numerical", "parch": "categorical"}
    declared_names = ["age", "fare"]

    with pytest.raises(ValueError) as exc_info:
        classifier = ak.StructuredDataClassifier(
            column_types=typed_columns,
            column_names=declared_names,
            directory=tmp_path,
            seed=test_utils.SEED,
        )
        classifier.fit(x=test_utils.TRAIN_CSV_PATH, y="survived")

    # Inspect the message only after the ``with`` block has captured the error.
    message = str(exc_info.value)
    assert "column_names and column_types are mismatched." in message
def test_structured_data_clf_convert_csv_to_df_and_np(fit, tmp_path):
    """Check the keyword arguments forwarded to the patched ``fit``.

    After fitting from CSV paths, the first recorded call must carry ``x`` as
    a pandas DataFrame and ``y`` as a numpy array.
    """
    classifier = ak.StructuredDataClassifier(
        directory=tmp_path,
        seed=test_utils.SEED,
    )
    validation = (test_utils.TEST_CSV_PATH, "survived")
    classifier.fit(
        x=test_utils.TRAIN_CSV_PATH,
        y="survived",
        epochs=2,
        validation_data=validation,
    )

    _positional, forwarded = fit.call_args_list[0]
    assert isinstance(forwarded["x"], pd.DataFrame)
    assert isinstance(forwarded["y"], np.ndarray)
def train_autokeras(X, Y, x, y, modelfile, max_trials=10, epochs=600):
    """Train an AutoKeras structured-data classifier, save it, and score it.

    Args:
        X: training features passed to ``fit``.
        Y: training labels passed to ``fit``.
        x: validation features; also used for the final prediction.
        y: validation labels; also used to compute the returned accuracy.
        modelfile: destination handed to ``model.save``.
        max_trials: ``max_trials`` forwarded to ``StructuredDataClassifier``.
        epochs: ``epochs`` forwarded to ``fit``.

    Returns:
        Tuple ``(acc, model)``: the value of ``getaccuracy`` on the predictions
        for ``x`` against ``y``, and the exported model.
    """
    # Imported here so autokeras is only required when this function is called.
    # The previously unused sklearn imports were dropped so that this function
    # no longer demands scikit-learn for nothing.
    import autokeras as ak

    clf = ak.StructuredDataClassifier(overwrite=True, max_trials=max_trials)
    clf.fit(X, Y, validation_data=(x, y), epochs=epochs)

    # Export and persist the best model before scoring.
    model = clf.export_model()
    model.save(modelfile)

    akpred = clf.predict(x)
    acc = getaccuracy(akpred, y)
    return acc, model