def docker_commands():
    # FIXME: doesn't allow building docker images for custom versions of h2o
    return """
RUN {here}/setup.sh
EXPOSE 54321
EXPOSE 54322
""".format(here=dir_of(__file__, True))
def docker_commands(*args, setup_cmd=None):
    return """
RUN {here}/setup.sh {args}
{cmd}
EXPOSE 54321
EXPOSE 54322
""".format(
        here=dir_of(__file__, True),
        args=' '.join(as_cmd_args(*args)),
        cmd="RUN {}".format(setup_cmd) if setup_cmd is not None else ""
    )
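# Usage sketch for docker_commands above. The arguments and setup_cmd values
# are hypothetical, not taken from the benchmark: the returned fragment is
# meant to be spliced into a generated Dockerfile, forwarding extra arguments
# to setup.sh and optionally adding one extra RUN step.
fragment = docker_commands('--version', '1.2.3', setup_cmd='pip install some-extra-package')
print(fragment)
# Expected shape of the output (the setup.sh path depends on dir_of):
# RUN /bench/frameworks/MyFramework/setup.sh --version 1.2.3
# RUN pip install some-extra-package
# EXPOSE 54321
# EXPOSE 54322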
def run(dataset: Dataset, config: TaskConfig):
    # TODO: use rpy2 instead? not necessary here though as the call is very simple
    log.info("\n**** Random Forest (R) ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    here = dir_of(__file__)
    run_cmd(r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', '{output}', {cores})" """.format(
        script=os.path.join(here, 'exec.R'),
        train=dataset.train.path,
        test=dataset.test.path,
        output=config.output_predictions_file,
        cores=config.cores
    ))
    log.info("Predictions saved to %s", config.output_predictions_file)
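# For illustration only: rendering the command template with hypothetical
# paths makes the contract of this wrapper explicit, namely that exec.R must
# define run(train_path, test_path, output_path, cores).
cmd = r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', '{output}', {cores})" """.format(
    script='/bench/frameworks/RandomForest/exec.R',
    train='/data/train.arff',
    test='/data/test.arff',
    output='/output/predictions.csv',
    cores=4)
print(cmd)
# Rscript --vanilla -e "source('/bench/frameworks/RandomForest/exec.R'); run('/data/train.arff', '/data/test.arff', '/output/predictions.csv', 4)"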
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)

        dataset.release()
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)),
                         _input_str_=params)

        # the subprocess is free to log anything to stdout: scan its output for
        # the result token line, then parse the following line as the JSON result
        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(
            dataset=dataset,
            output_file=res.output_file,
            probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
            predictions=load_data(res.predictions).squeeze(),
            truth=load_data(res.truth).squeeze(),
            target_is_encoded=res.target_is_encoded
        )
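# Sketch of the child-side half of the token protocol used above
# (hypothetical: the real exec_proc.py is not shown in this section). Anything
# the child prints before the token is treated as ordinary log output; the
# parent parses only the single line that follows the token. The result field
# names mirror what the parent reads from `res`.
import json
import sys

def emit_result(result_token, **result):
    print(result_token)
    print(json.dumps(result))
    sys.stdout.flush()

# emit_result(config.result_token,
#             output_file=config.output_predictions_file,
#             probabilities=None,
#             predictions='/tmp/xyz/predictions.csv',
#             truth='/tmp/xyz/truth.csv',
#             target_is_encoded=True)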
import io
import logging
import os
import uuid

from automl.benchmark import TaskConfig
from automl.data import Dataset
from automl.datautils import write_csv, read_csv
from automl.results import save_predictions_to_file
from automl.utils import Namespace as ns, TmpDir, dir_of, run_cmd, json_dumps, json_loads

log = logging.getLogger(__name__)

# run the framework in its own virtualenv interpreter; the -W flag rides along
# as part of the command prefix built in run_cmd
PYTHON = os.path.join(dir_of(__file__), 'venv/bin/python3 -W ignore')
# PYTHON = 'python3 -W ignore'


def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
import logging
import os
import signal
import sys
import tempfile as tmp

from automl.benchmark import TaskConfig
from automl.data import Dataset
from automl.datautils import Encoder, impute
from automl.results import save_predictions_to_file
from automl.utils import InterruptTimeout, Timer, dir_of, kill_proc_tree

os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

sys.path.append("{}/lib/hyperopt-sklearn".format(dir_of(__file__)))
from hpsklearn import HyperoptEstimator, any_classifier, any_regressor
from hyperopt import tpe
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, \
    mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

log = logging.getLogger(__name__)


def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
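# Hedged sketch of how a (loss, flag) mapping like the truncated one above is
# typically consumed; presumably the boolean flags whether the loss is computed
# on continuous outputs (probabilities) rather than hard predictions. Parameter
# values below are illustrative, not the benchmark's settings.
from hpsklearn import HyperoptEstimator, any_classifier
from hyperopt import tpe
from sklearn.metrics import accuracy_score

def accuracy_loss(y, pred):
    # hyperopt minimizes, so turn the accuracy score into a loss
    return 1.0 - accuracy_score(y, pred)

estimator = HyperoptEstimator(classifier=any_classifier('clf'),
                              algo=tpe.suggest,
                              loss_fn=accuracy_loss,
                              max_evals=100,
                              trial_timeout=300)
# estimator.fit(X_train, y_train); predictions = estimator.predict(X_test)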
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoWEKA ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(
        acc='errorRate',
        auc='areaUnderROC',
        logloss='kBInformation'
    )
    metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index)

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(min(config.max_mem_size_mb, math.ceil(config.max_mem_size_mb / parallelRuns)),
                       1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)

    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications='"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'.format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params
    )
    cmd = cmd_root + ' '.join(["-{} {}".format(k, v) for k, v in cmd_params.items()])

    with Timer() as training:
        run_cmd(cmd)

    # If target values are not sorted alphabetically in the ARFF file, then class probabilities
    # are returned in the original order; interestingly, other frameworks seem to always sort
    # the target values first. That's why we need to specify the probabilities labels here:
    # sorting+formatting is done in the saving function.
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed to produce any prediction.")
    with open(weka_file, 'r') as weka_preds:
        probabilities = []
        predictions = []
        truth = []
        for line in weka_preds.readlines()[1:-1]:
            inst, actual, predicted, error, *distribution = line.split(',')
            pred_probabilities = [pred_probability.replace('*', '').replace('\n', '')
                                  for pred_probability in distribution]
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions_to_file(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=truth,
        probabilities_labels=probabilities_labels
    )

    return dict(training_duration=training.duration)
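# Illustrative parse of one Weka prediction CSV row, mirroring the loop above.
# The sample line is made up, but follows the format the parser expects:
# inst#, actual, predicted, error flag, then one probability per class, with
# '*' marking the predicted class.
line = "1,2:yes,1:no,+,*0.738,0.262\n"
inst, actual, predicted, error, *distribution = line.split(',')
pred_probabilities = [p.replace('*', '').replace('\n', '') for p in distribution]
_, pred = predicted.split(':')  # 'no'
_, tru = actual.split(':')      # 'yes'
assert pred_probabilities == ['0.738', '0.262']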
def docker_commands(*args, **kwargs):
    return """
RUN {here}/setup.sh
""".format(here=dir_of(__file__, True))
import logging
import sys

from automl.benchmark import TaskConfig
from automl.data import Dataset
from automl.datautils import Encoder, impute
from automl.results import save_predictions_to_file
from automl.utils import Timer, dir_of

sys.path.append("{}/libs/oboe/automl".format(dir_of(__file__)))
from auto_learner import AutoLearner

log = logging.getLogger(__name__)


def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 29.01.2019: still under development by the oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(
        config.max_runtime_seconds, config.cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))
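# Hedged sketch of how the run function above presumably continues, based on
# oboe's AutoLearner interface at the time; the parameter names (p_type,
# n_cores, runtime_limit) are assumptions, not verified against this snippet,
# and error/metric handling is omitted.
def _finish_run_sketch(config, X_train, y_train, X_test):
    aml = AutoLearner(p_type='classification',
                      n_cores=config.cores,
                      runtime_limit=config.max_runtime_seconds)
    with Timer() as training:
        aml.fit(X_train, y_train)
    predictions = aml.predict(X_test)
    return predictions, training.duration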