def data_regression(
    self,
    column_names: list = None,
    column_types: dict = None,
    output_dim: int = None,
    **kwargs,
) -> ak.StructuredDataRegressor:
    """Data Regression.

    Args:
        column_names (list, optional): Names of the columns. Defaults to None.
        column_types (dict, optional): Types of the columns. Defaults to None.
        output_dim (int, optional): Number of output dimensions. Defaults to None.

    Returns:
        ak.StructuredDataRegressor: AutoKeras structured data regression class.
    """
    return ak.StructuredDataRegressor(
        column_names=column_names,
        column_types=column_types,
        output_dim=output_dim,
        loss=self.loss,
        metrics=self.metrics,
        project_name=self.project_name,
        max_trials=self.max_trials,
        directory=self.directory,
        objective=self.objective,
        tuner=self.tuner,
        overwrite=self.overwrite,
        seed=self.seed,
        max_model_size=self.max_model_size,
        **kwargs,
    )
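# A minimal, self-contained sketch of how the regressor configured by the method
# above is typically used. The synthetic data and column names here are
# illustrative assumptions, not values taken from the original code.
import numpy as np
import autokeras as ak

x = np.random.rand(100, 2)
y = np.random.rand(100, 1)
reg = ak.StructuredDataRegressor(
    column_names=["a", "b"],
    column_types={"a": "numerical", "b": "numerical"},
    output_dim=1,
    max_trials=1,
    overwrite=True,
)
reg.fit(x, y, epochs=1)
print(reg.predict(x).shape)  # expected: (100, 1)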
def test_structured_reg_fit_call_auto_model_fit(fit, tmp_path):
    auto_model = ak.StructuredDataRegressor(directory=tmp_path, seed=utils.SEED)

    auto_model.fit(
        x=utils.generate_structured_data(num_instances=100),
        y=utils.generate_data(num_instances=100, shape=(1,)),
    )

    assert fit.is_called
def test_structured_data_from_numpy_regressor(tmp_dir):
    num_data = 500
    data = common.structured_data(num_data)
    x_train = data
    y = np.random.rand(num_data, 1)
    y_train = y
    clf = ak.StructuredDataRegressor(directory=tmp_dir, max_trials=1)
    clf.fit(x_train, y_train, epochs=2, validation_data=(x_train, y_train))
def test_structured_data_from_csv_regressor(tmp_dir):
    clf = ak.StructuredDataRegressor(directory=tmp_dir, max_trials=1)
    clf.fit(x=common.TRAIN_FILE_PATH, y='fare', epochs=2,
            validation_data=common.TEST_FILE_PATH)
    x_test = common.csv_test('regression')
    assert clf.predict(x_test).shape == (len(x_test), 1)
def run():
    import numpy as np
    import autokeras as ak
    from sklearn.model_selection import train_test_split
    import pickle
    from tensorflow.keras.utils import plot_model
    from tensorflow.keras.models import model_from_json

    data = s.open_data()
    score = s.open_score()

    print('pn, c1, c2, c3.\nSelect!')
    flag = True
    while flag:
        print('study_type = ', end='', flush=True)
        study_type = input()
        if study_type == 'pn':
            flag = False
        elif study_type == 'c1':
            data = s.c1(data)
            flag = False
        elif study_type == 'c2':
            data = s.c2(data)
            flag = False
        elif study_type == 'c3':
            data = s.c3(data)
            flag = False
        else:
            flag = True

    score = score / 100
    x_train, x_test, y_train, y_test = train_test_split(data, score,
                                                        test_size=0.5)

    reg = ak.StructuredDataRegressor(max_trials=3)
    reg.fit(x_train, y_train, epochs=5)

    eva = reg.evaluate(x_test, y_test)
    eva_name = './result/' + study_type + '_3.txt'
    np.savetxt(eva_name, eva)

    model = reg.export_model()
    # json_string = model.to_json()
    # json_name = './result/' + study_type + '_3.json'
    # with open(json_name, 'w', encoding='utf-8') as f:
    #     f.write(json_string)

    model_name = './result/' + study_type + '_3.h5'
    model.save(model_name, save_format='tf')

    pdf_name = './result/' + study_type + '_3.pdf'
    plot_model(model, to_file=pdf_name)

    # weights_name = study_type + '.hdf5'
    # model.save_weights(weights_name)

    return model, x_test, y_test
def build_pipeline(self):
    """Makes a pipeline based on data_config."""
    if self.problem_type == "classification":
        automl_pipeline = ak.StructuredDataClassifier(**self.automl_settings)
    elif self.problem_type == "regression":
        automl_pipeline = ak.StructuredDataRegressor(**self.automl_settings)
    return automl_pipeline
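# A short usage sketch for build_pipeline. The config object below is a
# hypothetical stand-in (the real owning class and how it populates
# problem_type / automl_settings are not shown in this snippet); it only
# illustrates that problem_type selects the AutoKeras task class and that
# automl_settings is forwarded as keyword arguments.
import types

config = types.SimpleNamespace(
    problem_type="regression",
    automl_settings={"max_trials": 2, "overwrite": True},
)
# Calling the function above directly with the stand-in config:
pipeline = build_pipeline(config)
print(type(pipeline).__name__)  # StructuredDataRegressor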
def test_structured_data_from_numpy_regressor(tmp_dir):
    num_data = 500
    num_train = 400
    data = common.structured_data(num_data)
    x_train, x_test = data[:num_train], data[num_train:]
    y = np.random.rand(num_data, 1)
    y_train, y_test = y[:num_train], y[num_train:]
    clf = ak.StructuredDataRegressor(directory=tmp_dir, max_trials=1)
    clf.fit(x_train, y_train, epochs=2, validation_data=(x_train, y_train))
    assert clf.predict(x_test).shape == (len(y_test), 1)
def test_structured_reg_fit_call_auto_model_fit(fit, tmp_path):
    auto_model = ak.StructuredDataRegressor(directory=tmp_path, seed=utils.SEED)

    auto_model.fit(
        x=pd.read_csv(utils.TRAIN_CSV_PATH).to_numpy().astype(np.unicode)[:100],
        y=utils.generate_data(num_instances=100, shape=(1,)),
    )

    assert fit.is_called
def test_structured_data_from_numpy_regressor(tmp_dir):
    num_data = 500
    num_train = 400
    data = common.generate_structured_data(num_data)
    x_train, x_test = data[:num_train], data[num_train:]
    y = common.generate_data(num_instances=num_data, shape=(1,))
    y_train, y_test = y[:num_train], y[num_train:]
    clf = ak.StructuredDataRegressor(directory=tmp_dir, max_trials=1,
                                     seed=common.SEED)
    clf.fit(x_train, y_train, epochs=2, validation_data=(x_train, y_train))
    assert clf.predict(x_test).shape == (len(y_test), 1)
def run_example(self):
    # Initialize the regressor.
    regressor = ak.StructuredDataRegressor(max_trials=10,
                                           loss="mean_absolute_error")
    # x is the path to the csv file. y is the name of the column to predict.
    regressor.fit(x='./data/churn-train.csv', y='churn_probability')
    # Evaluate the error (MAE) of the found model.
    print('MAE: {mae}'.format(mae=regressor.evaluate(
        x='./data/churn-test.csv', y='churn_probability')))
def test_structured_data_from_numpy_regressor(tmp_path):
    num_data = 500
    num_train = 400
    data = utils.generate_data(num_data, shape=(10,))
    x_train, x_test = data[:num_train], data[num_train:]
    y = utils.generate_data(num_instances=num_data, shape=(1,))
    y_train, y_test = y[:num_train], y[num_train:]
    clf = ak.StructuredDataRegressor(directory=tmp_path, max_trials=2,
                                     seed=utils.SEED)
    clf.fit(x_train, y_train, epochs=20, validation_data=(x_train, y_train))
    clf.export_model()
    assert clf.predict(x_test).shape == (len(y_test), 1)
def test_structured_data_regressor(tmp_path):
    num_data = 500
    num_train = 400
    data = pd.read_csv(utils.TRAIN_CSV_PATH).to_numpy().astype(
        np.unicode)[:num_data]
    x_train, x_test = data[:num_train], data[num_train:]
    y = utils.generate_data(num_instances=num_data, shape=tuple())
    y_train, y_test = y[:num_train], y[num_train:]
    clf = ak.StructuredDataRegressor(
        directory=tmp_path, max_trials=2, seed=utils.SEED
    )
    clf.fit(x_train, y_train, epochs=11, validation_data=(x_train, y_train))
    clf.export_model()
    assert clf.predict(x_test).shape == (len(y_test), 1)
def test_structured_regressor(init, fit, tmp_dir):
    num_data = 500
    train_x = common.generate_structured_data(num_data)
    train_y = common.generate_data(num_instances=num_data, shape=(1,))
    clf = ak.StructuredDataRegressor(
        column_names=common.COLUMN_NAMES_FROM_NUMPY,
        directory=tmp_dir,
        max_trials=1,
        seed=common.SEED)
    clf.fit(train_x, train_y, epochs=2, validation_data=(train_x, train_y))
    assert init.called
    assert fit.called
def build_model(self) -> ak.AutoModel:
    model = None
    if self.data_type == 'image':
        if self.task_type == 'regression':
            model = ak.ImageRegressor()
        elif self.task_type == 'classification':
            model = ak.ImageClassifier()
    elif self.data_type == 'text':
        if self.task_type == 'regression':
            model = ak.TextRegressor()
        elif self.task_type == 'classification':
            model = ak.TextClassifier()
    elif self.data_type == 'csv':
        if self.task_type == 'regression':
            model = ak.StructuredDataRegressor()
        elif self.task_type == 'classification':
            model = ak.StructuredDataClassifier()
    return model
def test_sd_reg_init_hp0_equals_hp_of_a_model(tmp_path):
    clf = ak.StructuredDataRegressor(
        directory=tmp_path,
        column_names=["a", "b"],
        column_types={"a": "numerical", "b": "numerical"},
    )
    clf.inputs[0].shape = (2,)
    clf.outputs[0].in_blocks[0].output_shape = (10,)
    init_hp = task_specific.STRUCTURED_DATA_REGRESSOR[0]
    hp = kerastuner.HyperParameters()
    hp.values = copy.copy(init_hp)

    clf.tuner.hypermodel.build(hp)

    assert set(init_hp.keys()) == set(hp._hps.keys())
def main():
    house_dataset = fetch_california_housing()
    data = house_dataset.data
    target = np.array(house_dataset.target)
    x_train, x_test, y_train, y_test = train_test_split(data, target,
                                                        test_size=0.2,
                                                        random_state=42)
    clf = ak.StructuredDataRegressor(max_trials=10, directory='tmp_dir',
                                     overwrite=True)

    start_time = timeit.default_timer()
    clf.fit(x_train, y_train)
    stop_time = timeit.default_timer()

    mse = clf.evaluate(x_test, y_test)[1]
    print('RMSE: {rmse}'.format(rmse=round(math.sqrt(mse), 2)))
    print('Total time: {time} seconds.'.format(
        time=round(stop_time - start_time, 2)))
def train_autokeras(X_train, X_test, y_train, y_test, mtype, common_name_model,
                    problemtype, classes, default_featurenames, transform_model,
                    settings, model_session):
    # create file names
    files = list()
    model_name = common_name_model + '.pickle'

    # remove folder if it exists
    if mtype == 'c':
        if 'structured_data_classifier' in os.listdir():
            shutil.rmtree('structured_data_classifier')
        model = ak.StructuredDataClassifier(max_trials=100)
        model.fit(X_train, y_train)
        files.append('structured_data_classifier')
    elif mtype == 'r':
        if 'structured_data_regressor' in os.listdir():
            shutil.rmtree('structured_data_regressor')
        model = ak.StructuredDataRegressor(max_trials=100)
        model.fit(X_train, y_train)
        files.append('structured_data_regressor')

    # show predictions
    predictions = model.predict(X_test).flatten()
    print(predictions)

    # pickle the model
    picklefile = open(common_name_model + '.pickle', 'wb')
    pickle.dump(model, picklefile)
    picklefile.close()

    # get variables
    files.append(model_name)
    model_dir = os.getcwd()

    return model_name, model_dir, files
def main():
    # paths
    # path to time series
    time_series_path = '../output/dataframes/BTCUSDT/BTCUSDT_m_15.csv'
    # output paths
    model_dir = '../output/models/model'
    model_path = f'{model_dir}/model.h5'

    # data settings
    # number of candles to use as features
    feature_length = 50
    # number of candles to predict
    output_length = 1
    # share of training data from all samples
    train_size = 0.8
    # label function
    label = calculate_next_closing_price

    # model settings
    # number of models to test
    max_trials = 300

    # load data sets
    print('Loading data sets...')
    x_train, x_test, y_train, y_test = get_datasets(time_series_path,
                                                    feature_length,
                                                    output_length,
                                                    train_size, label)

    # normalize data
    x_train, mean, std = normalize_data(x_train)
    # store mean and std
    np.save(f'{model_dir}/mean', mean)
    np.save(f'{model_dir}/std', std)
    x_test = normalize_data(x_test, mean, std)[0]

    # flatten features
    x_train = np.array([x_train[i].flatten() for i in range(len(x_train))])
    x_test = np.array([x_test[i].flatten() for i in range(len(x_test))])

    start_time = timer()

    # get model
    if label == calculate_up_down_label:
        search = ak.StructuredDataClassifier(max_trials=max_trials,
                                             overwrite=True,
                                             metrics=['accuracy'])
    else:
        search = ak.StructuredDataRegressor(max_trials=max_trials,
                                            overwrite=True,
                                            metrics=['mean_absolute_error'])
    search.fit(x=x_train, y=y_train, validation_data=(x_test, y_test))
    print(f'Done getting model after {timer() - start_time}s!')

    model = search.export_model()
    model.summary()
    print(f'Evaluation: {model.evaluate(x_test, y_test)}')

    model.save(model_path)
    print(f'Model saved in {model_path}')
def test_structured_data_from_csv_regressor(tmp_dir):
    clf = ak.StructuredDataRegressor(directory=tmp_dir, max_trials=1)
    clf.fit(x=common.TRAIN_FILE_PATH, y='fare', epochs=2,
            validation_data=common.TEST_FILE_PATH)
x_train = x_train[:40]
x_test = x_test[:40]
y_train = y_train[:40]
y_test = y_test[:40]

x_train = x_train.reshape(40, 13).astype('float32') / 255.
x_test = x_test.reshape(40, 13).astype('float32') / 255.

from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

model = ak.StructuredDataRegressor(
    overwrite=False,
    max_trials=1,
    loss='mse',
    metrics=['mae'],
)

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
es = EarlyStopping(monitor='val_loss', mode='min', patience=6)
lr = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, verbose=2)
ck = ModelCheckpoint('C:/data/modelcheckpoint', save_weights_only=True,
                     save_best_only=True, monitor='val_loss', verbose=1)

# validation_split assumed so the val_loss-monitored callbacks have something to track
model.fit(x_train, y_train, validation_split=0.2, callbacks=[es, lr, ck])
    columns=house_dataset.feature_names + ["Price"],
)
train_size = int(df.shape[0] * 0.9)
df[:train_size].to_csv("train.csv", index=False)
df[train_size:].to_csv("eval.csv", index=False)
train_file_path = "train.csv"
test_file_path = "eval.csv"

"""
The second step is to run the
[StructuredDataRegressor](/structured_data_regressor).
As a quick demo, we set epochs to 10.
You can also leave the epochs unspecified for an adaptive number of epochs.
"""

# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    overwrite=True, max_trials=3
)  # It tries 3 different models.
# Feed the structured data regressor with training data.
reg.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "Price",
    epochs=10,
)
# Predict with the best model.
predicted_y = reg.predict(test_file_path)
# Evaluate the best model with testing data.
print(reg.evaluate(test_file_path, "Price"))

"""
## Data Format
The AutoKeras StructuredDataRegressor is quite flexible for the data format.
"""
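"""
As a quick illustration of that flexibility, the same regressor also accepts
in-memory data: a pandas DataFrame with the label passed separately as a Series,
or plain NumPy arrays. The sketch below assumes pandas is imported as pd earlier
in the script and reuses the train.csv file written above.
"""
# Read the training split back into memory.
x_train = pd.read_csv(train_file_path)
# Pop the label column out of the feature frame.
y_train = x_train.pop("Price")
# DataFrame features with a Series label ...
reg.fit(x_train, y_train, epochs=10)
# ... or plain NumPy arrays work just as well.
reg.fit(x_train.to_numpy(), y_train.to_numpy(), epochs=10)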
train_size = int(df.shape[0] * 0.9)
df[:train_size].to_csv('train.csv', index=False)
df[train_size:].to_csv('eval.csv', index=False)
train_file_path = 'train.csv'
test_file_path = 'eval.csv'

"""
The second step is to run the
[StructuredDataRegressor](/structured_data_regressor).
As a quick demo, we set epochs to 10.
You can also leave the epochs unspecified for an adaptive number of epochs.
"""

# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    overwrite=True,
    max_trials=3)  # It tries 3 different models.
# Feed the structured data regressor with training data.
reg.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    'Price',
    epochs=10)
# Predict with the best model.
predicted_y = reg.predict(test_file_path)
# Evaluate the best model with testing data.
print(reg.evaluate(test_file_path, 'Price'))

"""
## Data Format
values = values.astype('float32')

# Normalize
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
scaler_Y = MinMaxScaler(feature_range=(0, 1))

# Sliding window
reframed = series_to_supervised(data=scaled, col_names=dataset.columns,
                                n_in=18, n_out=1)

n_train_hours = int(reframed.shape[0] * 0.8)
train = reframed.iloc[:n_train_hours, :]
valid = reframed.iloc[n_train_hours:16779, :]

column_names = reframed.columns
column_names = column_names.drop('Energy(t)')
data_type = len(column_names) * ['numerical']
data_type = dict(zip(column_names, data_type))

# IPython.embed()
regressor = ak.StructuredDataRegressor(max_trials=2, column_types=data_type)
regressor.fit(x=train.drop(columns=['Energy(t)']), y=train['Energy(t)'])

IPython.embed()
# # delete
# dataset.drop(features_drop, axis=1, inplace=True)

# # save to csv
# df_train.to_csv("df_train.csv", mode='w')
# df_test.to_csv("df_test.csv", mode='w')

for dataset in train_test_data:
    dataset.drop('id', axis=1, inplace=True)

df_train.to_csv("df_train.csv", mode='w')
df_test.to_csv("df_test.csv", mode='w')

# # It tries n different models.
# clf = ak.StructuredDataClassifier(max_trials=100)
# # Feed the structured data classifier with training data.
# train_y = df_train.pop('price')
# clf.fit(x=df_train, y=train_y)

# It tries n different models.
clf = ak.StructuredDataRegressor()
# Feed the structured data regressor with training data.
train_y = df_train.pop('price')
clf.fit(x=df_train, y=train_y)

preds = clf.predict(df_test)

df_pred = pd.read_csv("./data/sample_submission.csv")
df_pred['price'] = preds
df_pred.to_csv("./houseSubmission.csv", index=None)
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Dropout
import autokeras as ak
from tensorflow.keras.datasets import boston_housing

(x_train, y_train), (x_test, y_test) = boston_housing.load_data()
print(x_train.shape, y_train.shape)  # (404, 13) (404,)
print(x_test.shape, y_test.shape)    # (102, 13) (102,)

model = ak.StructuredDataRegressor(overwrite=True, max_trials=3)
model.fit(x_train, y_train, epochs=10, validation_split=0.2)

results = model.evaluate(x_test, y_test)

model2 = model.export_model()
best_model = model.tuner.get_best_model()
best_model.save('C:/data/h5/best_boston.h5')

# best_model = load_model('C:/data/h5/best_boston.h5')
# results = best_model.evaluate(x_test, y_test)
# print('results: ', results)
# best_model.summary()
import autokeras as ak
from sklearn.datasets import load_boston

dataset = load_boston()
x = dataset.data
y = dataset.target

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                     shuffle=True,
                                                     random_state=42)

# 1. Data / preprocessing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

model = ak.StructuredDataRegressor(loss='mse', metrics=['mae'], max_trials=2,
                                   overwrite=True)

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
es = EarlyStopping(patience=20, verbose=1)
lr = ReduceLROnPlateau(factor=0.5, patience=10, verbose=1)

model.fit(x_train, y_train, epochs=300, validation_split=0.2, callbacks=[es, lr])

results = model.evaluate(x_test, y_test)

from sklearn.metrics import r2_score
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)

model2 = model.export_model()
try:
    model2.save('ak_save_boston', save_format='tf')
except Exception:
    # assumed fallback to HDF5 if the SavedModel export fails
    model2.save('ak_save_boston.h5')
def run_example(self, train_path, test_path, target):
    metrics = {}
    train = pd.read_csv(train_path)

    # Auto-Keras
    regressor = ak.StructuredDataRegressor(max_trials=10,
                                           loss="mean_absolute_error")
    regressor.fit(x=train, y=target)
    metrics["auto-keras"] = regressor.evaluate(x=train, y=target)[0]

    # AutoGluon
    train_data = task.Dataset(file_path=train_path)
    label_column = target
    predictor = task.fit(train_data=train_data, label=label_column,
                         eval_metric="mean_absolute_error")
    test_data = task.Dataset(file_path=test_path)
    y_test = test_data[label_column]  # values to predict
    # delete label column to prove we're not cheating
    test_data_nolab = test_data.drop(labels=[label_column], axis=1)
    y_pred = predictor.predict(test_data_nolab)
    metrics["auto-gluon"] = predictor.evaluate_predictions(
        y_true=y_test, y_pred=y_pred,
        auxiliary_metrics=True)["mean_absolute_error"]

    # auto-sklearn
    categorical_feature_mask = train.dtypes == object
    categorical_cols = train.columns[categorical_feature_mask].tolist()
    le = LabelEncoder()
    train[categorical_cols] = train[categorical_cols].apply(
        lambda col: le.fit_transform(col))
    X_train = train.drop(columns=[target]).to_numpy()
    y_train = train[target].to_numpy()
    test = pd.read_csv(test_path)
    test[categorical_cols] = test[categorical_cols].apply(
        lambda col: le.fit_transform(col))
    X_test = test.drop(columns=[target]).to_numpy()
    y_test = test[target].to_numpy()
    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
    )
    automl.fit(X_train.copy(), y_train.copy(),
               metric=autosklearn.metrics.mean_absolute_error)
    automl.refit(X_train.copy(), y_train.copy())
    predictions = automl.predict(X_test)
    metrics["auto-sklearn"] = sklearn.metrics.mean_absolute_error(
        y_test, predictions)

    # H2O AutoML
    h2o.init()
    train = h2o.import_file(train_path)
    test = h2o.import_file(test_path)
    x = train.columns
    y = target
    x.remove(y)
    aml = H2OAutoML(max_runtime_secs=20, seed=1, sort_metric="mae")
    aml.train(x=x, y=y, training_frame=train)
    metrics["h2o-automl"] = aml.leader.model_performance(test).mae()
    h2o.shutdown()

    # TPOT
    tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2,
                         random_state=42, scoring='neg_mean_absolute_error',
                         cv=5)
    tpot.fit(X_train, y_train)
    metrics["tpot"] = -tpot.score(X_test, y_test)

    best_metric = float("inf")
    best_model = "MODEL"
    for metric in metrics:
        if metrics[metric] < best_metric:
            best_metric = metrics[metric]
            best_model = metric
    print("THE BEST AUTOML TOOL IS " + str(best_model) + ", WITH A MAE OF " +
          str(best_metric) + " ACHIEVED BY THE BEST MODEL.")
    return metrics
def prepare_and_test(X, y, task, timelife):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=1)
    if isinstance(y_train, pd.Series):
        y_train = y_train.to_frame()
    if isinstance(y_test, pd.Series):
        y_test = y_test.to_frame()
    print(y)
    # print(X_train)
    # print(y_train, "let's see: y_train " + str(len(np.unique(y_train))))
    # print(y_test, "let's see: y_test " + str(len(np.unique(y_test))))

    if task == 'classification':
        clf = ak.StructuredDataClassifier(overwrite=True, max_trials=3,
                                          metrics=['accuracy', f1_score])
        custom_obj = {'f1_score': f1_score}
    else:
        clf = ak.StructuredDataRegressor(
            overwrite=True, max_trials=3,
            metrics=['mean_squared_error', r2_score])
        custom_obj = {'r2_score': r2_score}

    clf.fit(X_train, y_train, validation_split=0.15, epochs=timelife)

    model = clf.export_model(custom_objects=custom_obj)
    model.summary()
    summary = []
    model.summary(print_fn=lambda x: summary.append(x))
    model_summary = '\n'.join(summary)

    y_pred = clf.predict(X_test, custom_objects=custom_obj)
    # le = LabelEncoder()  # maybe it is better to remove everything related to the label encoder
    # print('not sure what I am doing here ', accuracy_score(y_test, y_pred))
    # y_test = le.fit_transform(y_test).to_numpy()
    # y_pred = le.fit_transform(y_pred).to_numpy()
    y_test = y_test.to_numpy()
    print(y_test)
    print(y_pred)
    # print(clf.evaluate(X_test, y_test))

    if task == 'classification':
        shutil.rmtree('./structured_data_classifier')
        if len(np.unique(y)) > 2:
            print('multiclass')
            print(sklearn.metrics.accuracy_score(y_test, y_pred),
                  sklearn.metrics.f1_score(y_test, y_pred, average='weighted'))
            # print(clf.evaluate(X_test, y_test, custom_objects=custom_obj))
            # return (clf.evaluate(X_test, y_test)[0],
            #         f1_score(y_test, y_pred, average='weighted'), model_summary)
            return (sklearn.metrics.accuracy_score(y_test, y_pred),
                    sklearn.metrics.f1_score(y_test, y_pred, average='weighted'),
                    model_summary)
        else:
            print('binary')
            # print(clf.evaluate(X_test, y_test))
            # return (clf.evaluate(X_test, y_test)[0],
            #         f1_score(y_test, y_pred), model_summary)
            return (sklearn.metrics.accuracy_score(y_test, y_pred),
                    sklearn.metrics.f1_score(y_test, y_pred),
                    model_summary)
    else:
        shutil.rmtree('./structured_data_regressor')
        # print(clf.evaluate(X_test, y_test))
        # print(clf.evaluate(X_test, y_test)[0], r2_score(y_test, y_pred))
        # return (clf.evaluate(X_test, y_test)[0],
        #         r2_score(y_test, y_pred), model_summary)
        return (np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)),
                sklearn.metrics.r2_score(y_test, y_pred),
                model_summary)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import autokeras as ak

datasets = load_boston()
x = datasets.data
y = datasets.target
print(x.shape)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                     random_state=104)

model = ak.StructuredDataRegressor(
    overwrite=True,
    max_trials=2,  # how many trials to run
    loss='mse',
    metrics=['mae']
)

model.fit(x_train, y_train, epochs=10, validation_split=0.2)

results = model.evaluate(x_test, y_test)
print(results)

model2 = model.export_model()
try:
    model2.save('./ak_test/boston', save_format='tf')
except:
    model2.save('./ak_test/boston.h5')

best_model = model.tuner.get_best_model()
def get_auto_model(self):
    return ak.StructuredDataRegressor(max_trials=10,
                                      directory=self.tmp_dir,
                                      overwrite=True)