def run(dataset, config):
    """Fit a plain scikit-learn Random Forest on the pre-encoded dataset.

    Trains a classifier or regressor depending on ``config.type`` and
    returns a namespace bundling predictions, class probabilities (for
    classification only), ground truth and the output file path, ready
    for the caller to persist.
    """
    log.info("\n**** Random Forest (sklearn %s) ****\n", sklearn.__version__)

    is_classification = config.type == 'classification'

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, config.cores))
    # This baseline deliberately ignores the benchmark's runtime/metric hints.
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".format(
            config.metric))

    # Select the estimator class matching the task type.
    if is_classification:
        estimator = RandomForestClassifier
    else:
        estimator = RandomForestRegressor

    forest = estimator(n_jobs=config.cores, **config.framework_params)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    # predict_proba is only defined for classifiers.
    probabilities = forest.predict_proba(X_test) if is_classification else None

    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)
def run(dataset: Dataset, config: TaskConfig):
    """Run the framework in a detached subprocess, exchanging data via temp CSV files.

    Serializes the dataset to CSV files in a temporary directory, invokes
    ``exec_proc.py`` with a JSON payload on stdin, then scans the child's
    stdout for a unique result token followed by a JSON result line, and
    finally saves the predictions it references.
    """
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        # FIX: these four statements originally ended with stray trailing commas,
        # turning each call into a discarded one-element tuple expression —
        # harmless at runtime but misleading; removed.
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        # Free the in-memory dataset before spawning the subprocess.
        dataset.release()
        # Unique token lets us find the result line in arbitrary child output.
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, _ = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)),
                            _input_str_=params)
        out = io.StringIO(output)
        # NOTE(review): if the token is never found, `res` stays an empty ns and the
        # attribute accesses below will fail — presumably run_cmd raises first; verify.
        res = ns()
        for line in out:
            if line.rstrip() == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            # Result files are raw, headerless CSV arrays.
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
# Load the user's config.yaml, preferring an explicit --userdir over the default.
config_user = config_load(
    os.path.join(args.userdir if args.userdir is not None else config.user_dir, "config.yaml"))
# config listing properties set by command line
config_args = ns.parse(
    {'results.save': args.keep_scores},
    input_dir=args.indir,
    output_dir=args.outdir,
    user_dir=args.userdir,
    run_mode=args.mode,
    script=os.path.basename(__file__),
    sid=sid,
) + ns.parse(extras)
if args.mode != 'local':
    # NOTE(review): the result of `+` is discarded here — unless ns.__add__
    # mutates in place, this line has no effect; confirm against ns semantics.
    config_args + ns.parse({'monitoring.frequency_seconds': 0})
# Drop unset (None) command-line properties so they don't mask file-level config.
config_args = ns({k: v for k, v in config_args if v is not None})
log.debug("Config args: %s.", config_args)
# merging all configuration files
automl.resources.from_configs(config, config_user, config_args)
try:
    # Dispatch to the benchmark runner matching the requested execution mode.
    if args.mode == 'local':
        bench = automl.Benchmark(args.framework,
                                 args.benchmark,
                                 parallel_jobs=args.parallel)
    elif args.mode == 'docker':
        bench = automl.DockerBenchmark(args.framework,
                                       args.benchmark,
                                       parallel_jobs=args.parallel)
    elif args.mode == 'aws':
        bench = automl.AWSBenchmark(args.framework,
    # Bundle everything the parent process needs to persist the results.
    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)


if __name__ == '__main__':
    # Subprocess entry point: the parent pipes a JSON payload on stdin
    # (dataset file paths + task config — see the parent-side run()).
    params = json_loads(sys.stdin.read(), as_namespace=True)

    def load_data(path):
        # Dataset files are raw, headerless CSV arrays.
        return read_csv(path, as_data_frame=False, header=False)

    # Rehydrate the dataset from the CSV files referenced in the payload;
    # targets were written as single columns, so squeeze back to 1-D.
    ds = ns(train=ns(X_enc=load_data(params.dataset.train.X_enc),
                     y=load_data(params.dataset.train.y).squeeze()),
            test=ns(X_enc=load_data(params.dataset.test.X_enc),
                    y=load_data(params.dataset.test.y).squeeze(),
                    ))
    config = params.config
    config.framework_params = ns.dict(config.framework_params)
    result = run(ds, config)
    # Replace in-memory arrays with file paths: write each result array to
    # result_dir and hand the parent a path-based copy of the result.
    res = copy.copy(result)
    res.predictions = os.path.join(config.result_dir, 'predictions')
    res.truth = os.path.join(config.result_dir, 'truth')
    write_csv(result.predictions.reshape(-1, 1), res.predictions)
    write_csv(result.truth.reshape(-1, 1), res.truth)
    if result.probabilities is not None:
        res.probabilities = os.path.join(config.result_dir, 'probabilities')
        write_csv(result.probabilities, res.probabilities)