def test_simple_file_passes(self):
    """Check the values parsed from the `simple_file.yml` fixture."""
    fixture_path = os.path.abspath('tests/fixtures/simple_file.yml')
    plxfile = PolyaxonFile(fixture_path)
    spec = plxfile.experiment_spec_at(0)

    # File-level values.
    assert plxfile.version == 1
    assert plxfile.project.name == 'project1'
    assert plxfile.project_path == '/tmp/plx_logs/project1'
    assert plxfile.matrix is None
    assert plxfile.settings is None
    assert plxfile.run_type == RunTypes.LOCAL

    # Values of the single experiment.
    assert spec.environment is None
    assert spec.experiment_path == '/tmp/plx_logs/project1/0'
    assert spec.is_runnable
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
    expected_cluster = {
        TaskType.MASTER: ['127.0.0.1:10000'],
        TaskType.PS: [],
        TaskType.WORKER: [],
    }
    assert_equal_dict(spec.get_cluster().to_dict(), expected_cluster)

    # Model section.
    model = spec.model
    assert isinstance(model, RegressorConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 4
    assert graph.input_layers == [['images', 0, 0]]
    # The graph output must point at the last declared layer.
    assert graph.output_layers == [[graph.layers[-1].name, 0, 0]]

    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval is None
def test_run_matrix_file_passes(self):
    """Check the `run_exec_matrix_file.yml` fixture: matrix values and per-experiment commands."""
    fixture_path = os.path.abspath('tests/fixtures/run_exec_matrix_file.yml')
    plxfile = PolyaxonFile(fixture_path)

    assert plxfile.version == 1
    assert plxfile.project.name == 'video_prediction'
    logs_root = get_vol_path(constants.LOGS_VOLUME, RunTypes.MINIKUBE)
    assert plxfile.project_path == logs_root + 'video_prediction'
    assert isinstance(plxfile.matrix['model'], MatrixConfig)
    assert plxfile.matrix['model'].to_dict() == {'values': ['CDNA', 'DNA', 'STP']}
    assert plxfile.matrix_space == 3

    def by_model(declaration):
        return declaration['model']

    # One declaration is expected per matrix value, in any order.
    expected = [{'model': value} for value in plxfile.matrix['model'].to_numpy()]
    actual_sorted = sorted(plxfile.matrix_declarations, key=by_model)
    expected_sorted = sorted(expected, key=by_model)
    assert actual_sorted == expected_sorted

    assert isinstance(plxfile.settings, SettingsConfig)
    assert plxfile.run_type == RunTypes.MINIKUBE
    assert len(plxfile.experiment_specs) == plxfile.matrix_space

    cmd_template = 'video_prediction_train --model="{model}" --num_masks={num_masks}'
    for xp in range(plxfile.matrix_space):
        spec = plxfile.experiment_spec_at(xp)
        assert spec.is_runnable
        assert spec.environment is None
        assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
        assert spec.model is None
        run_exec = spec.run_exec
        assert isinstance(run_exec, RunExecConfig)

        # The expected command uses 1 mask for DNA and 10 for the other models.
        declarations = plxfile.get_declarations_at(xp)
        if declarations['model'] == 'DNA':
            declarations['num_masks'] = 1
        else:
            declarations['num_masks'] = 10
        assert run_exec.cmd == cmd_template.format(**declarations)
def test_run_simple_file_passes(self):
    """Check the values parsed from the `run_exec_simple_file.yml` fixture."""
    fixture_path = os.path.abspath('tests/fixtures/run_exec_simple_file.yml')
    plxfile = PolyaxonFile(fixture_path)
    spec = plxfile.experiment_spec_at(0)

    # File-level values.
    assert plxfile.version == 1
    assert plxfile.project.name == 'video_prediction'
    assert plxfile.settings is None
    assert plxfile.run_type == RunTypes.LOCAL
    assert plxfile.project_path == "/tmp/plx_logs/video_prediction"

    # Values of the single experiment.
    assert spec.experiment_path == "/tmp/plx_logs/video_prediction/0"
    assert spec.is_runnable
    assert spec.environment is None
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
    assert spec.model is None

    # A run/exec file carries a command instead of a model.
    executable = spec.run_exec
    assert isinstance(executable, RunExecConfig)
    assert executable.cmd == "video_prediction_train --model=DNA --num_masks=1"
def test_run_matrix_file_passes(self):
    """Check the `run_exec_matrix_file.yml` fixture through the specification API."""
    fixture_path = os.path.abspath('tests/fixtures/run_exec_matrix_file.yml')
    plxfile = PolyaxonFile(fixture_path)
    spec = plxfile.specification

    # Group-level values.
    assert spec.version == 1
    assert spec.project.name == 'video_prediction'
    assert isinstance(spec.settings.matrix['model'], MatrixConfig)
    expected_matrix = {'values': ['CDNA', 'DNA', 'STP']}
    assert spec.settings.matrix['model'].to_dict() == expected_matrix
    assert spec.matrix_space == 3
    assert isinstance(spec.settings, SettingsConfig)

    # Derive one experiment spec from the test declaration.
    declarations = spec.matrix_declaration_test
    spec = spec.get_experiment_spec(declarations)
    assert spec.is_runnable
    assert spec.environment is None
    assert spec.settings is not None
    assert spec.settings.logging is not None
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
    assert spec.model is None
    run_exec = spec.run_exec
    assert isinstance(run_exec, RunExecConfig)

    # The expected command uses 1 mask for DNA and 10 for the other models.
    if declarations['model'] == 'DNA':
        declarations['num_masks'] = 1
    else:
        declarations['num_masks'] = 10
    cmd_template = 'video_prediction_train --model="{model}" --num_masks={num_masks}'
    assert run_exec.cmd == cmd_template.format(**declarations)
def test_one_matrix_file_passes(self):
    """Check the `one_matrix_file.yml` fixture: one matrix entry with two values."""
    fixture_path = os.path.abspath('tests/fixtures/one_matrix_file.yml')
    plxfile = PolyaxonFile(fixture_path)
    spec = plxfile.specification

    # Group-level values.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert spec.settings is not None
    assert isinstance(spec.settings.matrix['loss'], MatrixConfig)
    expected_matrix = {'values': ['MeanSquaredError', 'AbsoluteDifference']}
    assert spec.settings.matrix['loss'].to_dict() == expected_matrix
    assert spec.matrix_space == 2

    # Derive one experiment spec from the test declaration.
    spec = spec.get_experiment_spec(matrix_declaration=spec.matrix_declaration_test)
    assert spec.is_runnable
    assert spec.environment is None
    assert spec.framework is None
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)

    model = spec.model
    assert isinstance(model, RegressorConfig)
    # Either loss from the matrix is acceptable.
    accepted_losses = (MeanSquaredErrorConfig, AbsoluteDifferenceConfig)
    assert isinstance(model.loss, accepted_losses)
    assert isinstance(model.optimizer, AdamConfig)
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 4
    assert graph.input_layers == [['images', 0, 0]]
    assert graph.output_layers == [[graph.layers[-1].name, 0, 0]]
    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
def start_experiment_run(polyaxonfile, experiment_id, task_type, task_id, schedule):
    """Prepare the experiment for one task and invoke the requested schedule.

    `schedule` is the name of the method looked up on the prepared
    experiment; the result of calling that method is returned.
    """
    parsed_file = PolyaxonFile.read(polyaxonfile)
    experiment = prepare_experiment_run(parsed_file, experiment_id, task_type, task_id)
    return getattr(experiment, schedule)()
def _get_run_configs(polyaxonfile, experiment_id):
    """Build the run configs of every task of one experiment.

    Returns a `(configs, is_distributed)` tuple, where `configs` is keyed by
    task type.

    NOTE(review): in the non-distributed case the master entry is a single
    config, while in the distributed case every entry is a list -- confirm
    this against callers that index `configs[task_type][task_id]`.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    environment = plx_file.get_environment_at(experiment_id)
    cluster_def, is_distributed = plx_file.get_cluster_def_at(experiment_id)

    def get_master_config(config, task_type=None, task_id=None):
        # Always work on a config derived from the base one.
        config = RunConfig.from_config(config)
        if task_type is not None or task_id is not None:
            return config.replace(task_type=task_type, task_id=task_id)
        return config

    def build_task_configs(task_type, count, indexed_sessions, default_session):
        # One config per task; a task-specific session config wins over the default.
        task_configs = []
        for task_id in range(count):
            task_config = get_master_config(config, task_type=task_type, task_id=task_id)
            session_config = indexed_sessions.get(task_id, default_session)
            if session_config:
                session_config = RunConfig.get_session_config(session_config)
                task_config = task_config.replace(session_config=session_config)
            task_configs.append(task_config)
        return task_configs

    config = environment.run_config or RunConfig.CONFIG()
    if not is_distributed:
        return {TaskType.MASTER: get_master_config(config)}, False

    config.cluster = plx_file.get_cluster(experiment=experiment_id)
    configs = {TaskType.MASTER: [get_master_config(config, TaskType.MASTER, 0)]}
    n_workers = cluster_def.get(TaskType.WORKER, 0)
    n_ps = cluster_def.get(TaskType.PS, 0)

    # Session configs declared for specific task indices.
    worker_sessions = {s.index: s for s in environment.worker_configs or []}
    ps_sessions = {s.index: s for s in environment.ps_configs or []}

    worker_task_configs = build_task_configs(
        TaskType.WORKER, n_workers, worker_sessions, environment.default_worker_config)
    if n_workers > 0:
        configs[TaskType.WORKER] = worker_task_configs

    ps_task_configs = build_task_configs(
        TaskType.PS, n_ps, ps_sessions, environment.default_ps_config)
    if n_ps > 0:
        configs[TaskType.PS] = ps_task_configs

    return configs, True
def test_matrix_file_passes(self):
    """Check the `matrix_file.yml` fixture: a 5 x 2 matrix and each of its experiments."""
    fixture_path = os.path.abspath('tests/fixtures/matrix_file.yml')
    plxfile = PolyaxonFile(fixture_path)

    assert plxfile.version == 1
    assert plxfile.project.name == 'project1'
    assert plxfile.project_path == '/tmp/plx_logs/project1'
    assert isinstance(plxfile.matrix['lr'], MatrixConfig)
    assert isinstance(plxfile.matrix['loss'], MatrixConfig)
    expected_lr = {'logspace': {'start': 0.01, 'stop': 0.1, 'num': 5}}
    assert plxfile.matrix['lr'].to_dict() == expected_lr
    expected_loss = {'values': ['MeanSquaredError', 'AbsoluteDifference']}
    assert plxfile.matrix['loss'].to_dict() == expected_loss
    assert plxfile.matrix_space == 10

    def by_lr_and_loss(declaration):
        return declaration['lr'], declaration['loss']

    # The declarations must be the cross product of both matrix entries.
    expected = [
        {'loss': loss, 'lr': lr}
        for lr in plxfile.matrix['lr'].to_numpy()
        for loss in plxfile.matrix['loss'].to_numpy()
    ]
    actual_sorted = sorted(plxfile.matrix_declarations, key=by_lr_and_loss)
    expected_sorted = sorted(expected, key=by_lr_and_loss)
    assert actual_sorted == expected_sorted

    assert isinstance(plxfile.settings, SettingsConfig)
    assert plxfile.settings.concurrent_experiments == 2
    assert plxfile.run_type == RunTypes.LOCAL

    accepted_losses = (MeanSquaredErrorConfig, AbsoluteDifferenceConfig)
    for xp in range(plxfile.matrix_space):
        spec = plxfile.experiment_spec_at(xp)
        assert spec.is_runnable
        assert spec.environment is None
        assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
        expected_cluster = {
            TaskType.MASTER: ['127.0.0.1:10000'],
            TaskType.PS: [],
            TaskType.WORKER: [],
        }
        assert_equal_dict(spec.get_cluster().to_dict(), expected_cluster)

        model = spec.model
        assert isinstance(model, RegressorConfig)
        assert isinstance(model.loss, accepted_losses)
        assert isinstance(model.optimizer, AdamConfig)
        graph = model.graph
        assert isinstance(graph, GraphConfig)
        assert len(graph.layers) == 4
        assert graph.input_layers == [['images', 0, 0]]
        assert graph.output_layers == [[graph.layers[-1].name, 0, 0]]
        assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
def init(project, run, model):
    """Initialize a new polyaxonfile specification.

    Args:
        project: project identifier, resolved through `get_project_or_local`.
        run: when truthy (and `model` is not), create the run init file.
        model: when truthy, create the model init file.

    Exits with status 1 when the project cannot be fetched, when neither
    `model` nor `run` is given, when no init file exists after the attempt,
    or when an already existing file is not a valid polyaxonfile. Exits
    with status 0 once a new file has been created.
    """
    user, project_name = get_project_or_local(project)
    try:
        project_config = PolyaxonClients().project.get_project(user, project_name)
    except (PolyaxonHTTPError, PolyaxonShouldExitError) as e:
        Printer.print_error('Make sure you have a project with this name `{}`'.format(project))
        Printer.print_error('You can a new project with this command: '
                            'polyaxon project create --name={} --description=...'.format(project))
        Printer.print_error('Error message `{}`.'.format(e))
        sys.exit(1)

    # At least one init option is required.
    if not (model or run):
        Printer.print_error("You must specify which an init option, "
                            "possible values: `--model` or `--run`.")
        sys.exit(1)

    result = False
    if model:
        result = create_init_file(constants.INIT_FILE_MODEL)
    elif run:
        result = create_init_file(constants.INIT_FILE_RUN)

    if result:
        ProjectManager.set_config(project_config, init=True)
        IgnoreManager.init_config()
        Printer.print_success(
            "Project `{}` was initialized and Polyaxonfile was created successfully `{}`".format(
                project, constants.INIT_FILE))
        # The file was created: report success to the shell (was exit status 1).
        sys.exit(0)

    # If we are here the file was not created.
    if not os.path.isfile(constants.INIT_FILE):
        Printer.print_error(
            "Something went wrong, init command did not create a file.\n"
            "Possible reasons: you don't have the write to create the file.")
        sys.exit(1)

    # The file was already there: check that it parses as a valid specification.
    try:
        PolyaxonFile(constants.INIT_FILE).specification
    except (PolyaxonfileError, ValidationError) as e:
        Printer.print_error(
            "Something went wrong, init command did not create a file.\n"
            "Another file already exist with.")
        Printer.print_error('Error message: `{}`.'.format(e))
        sys.exit(1)

    # The existing file is valid: re-initialize the configurations.
    ProjectManager.set_config(project_config, init=True)
    IgnoreManager.init_config()
    Printer.print_success(
        "Project `{}` was initialized and Polyaxonfile was created successfully `{}`".format(
            project, constants.INIT_FILE))
def run(polyaxonfile):
    """Run every experiment of the polyaxonfile, one after the other.

    After starting an experiment, the master process is checked every 10
    seconds until the shared `current_run` state is marked finished; the
    state is then reset for the next experiment.
    """
    parsed_file = PolyaxonFile.read(polyaxonfile)
    for experiment_idx in range(parsed_file.matrix_space):
        run_experiment(parsed_file.experiment_specs[experiment_idx], experiment_idx)

        while True:
            if current_run['finished']:
                break
            check_master_process()
            time.sleep(10)

        # Reset the shared state before the next experiment.
        current_run['finished'] = False
        current_run['master'] = None
def test_matrix_early_stopping_file_passes(self):
    """Check the `matrix_file_early_stopping.yml` fixture: matrix, search and early stopping."""
    fixture_path = os.path.abspath('tests/fixtures/matrix_file_early_stopping.yml')
    plxfile = PolyaxonFile(fixture_path)
    spec = plxfile.specification

    # Group-level values.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings.matrix['lr'], MatrixConfig)
    assert isinstance(spec.settings.matrix['loss'], MatrixConfig)
    expected_lr = {'logspace': {'start': 0.01, 'stop': 0.1, 'num': 5}}
    assert spec.settings.matrix['lr'].to_dict() == expected_lr
    expected_loss = {'values': ['MeanSquaredError', 'AbsoluteDifference']}
    assert spec.settings.matrix['loss'].to_dict() == expected_loss
    assert spec.matrix_space == 10
    assert isinstance(spec.settings, SettingsConfig)
    assert spec.settings.concurrent_experiments == 2
    assert spec.settings.random_search.n_experiments == 5
    assert spec.early_stopping == spec.settings.early_stopping
    assert len(spec.settings.early_stopping) == 1
    assert isinstance(spec.settings.early_stopping[0], EarlyStoppingMetricConfig)

    # Disabled check, kept as found:
    # assert spec.experiments_def == (
    #     10,
    #     5,
    #     2,
    #     SearchAlgorithms.RANDOM
    # )

    # Derive one experiment spec from the test declaration.
    spec = spec.get_experiment_spec(matrix_declaration=spec.matrix_declaration_test)
    assert spec.is_runnable
    assert spec.environment is None
    assert spec.framework is None
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)

    model = spec.model
    assert isinstance(model, RegressorConfig)
    accepted_losses = (MeanSquaredErrorConfig, AbsoluteDifferenceConfig)
    assert isinstance(model.loss, accepted_losses)
    assert isinstance(model.optimizer, AdamConfig)
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 4
    assert graph.input_layers == [['images', 0, 0]]
    assert graph.output_layers == [[graph.layers[-1].name, 0, 0]]
    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
def prepare_all_experiment_runs(polyaxonfile, experiment_id):
    """Create one `Experiment` per task of the given experiment.

    Returns a list whose first item is the master experiment; when the run
    is distributed, the worker experiments and then the ps experiments follow.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    is_distributed = False

    if plx_file.get_environment_at(experiment_id):
        # An environment is declared: use its logging level and run configs.
        tf.logging.set_verbosity(LOGGING_LEVEL[plx_file.settings.logging.level])
        configs, is_distributed = _get_run_configs(
            plx_file.settings.environment, experiment_id)
        delay_workers_by_global_step = (
            plx_file.settings.environment.delay_workers_by_global_step)
    else:
        # No environment: a single master with a default run config.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False

    train_input_fn, train_steps, train_hooks = _get_train(
        plx_file.get_train_at(experiment_id))
    eval_parts = _get_eval(plx_file.get_eval_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks,
     eval_delay_secs, continuous_eval_throttle_secs) = eval_parts

    def get_experiment(config):
        # All tasks share the train/eval setup; only the run config differs.
        estimator = getters.get_estimator(
            plx_file.model, config, output_dir=plx_file.project_path)
        return Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            train_steps=train_steps,
            eval_steps=eval_steps,
            train_hooks=train_hooks,
            eval_hooks=eval_hooks,
            eval_delay_secs=eval_delay_secs,
            continuous_eval_throttle_secs=continuous_eval_throttle_secs,
            delay_workers_by_global_step=delay_workers_by_global_step,
            export_strategies=plx_file.settings.export_strategies)

    experiments = [get_experiment(configs[TaskType.MASTER][0])]
    if is_distributed:
        for worker_config in configs.get(TaskType.WORKER, []):
            experiments.append(get_experiment(worker_config))
        for ps_config in configs.get(TaskType.PS, []):
            experiments.append(get_experiment(ps_config))
    return experiments
def test_run_simple_file_passes(self):
    """Check the `run_exec_simple_file.yml` fixture through the specification API."""
    fixture_path = os.path.abspath('tests/fixtures/run_exec_simple_file.yml')
    spec = PolyaxonFile(fixture_path).specification

    assert spec.version == 1
    assert spec.project.name == 'video_prediction'
    assert spec.settings is None
    assert spec.is_runnable
    assert spec.environment is None
    assert spec.framework is None
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
    assert spec.model is None

    # A run/exec file carries a command instead of a model.
    executable = spec.run_exec
    assert isinstance(executable, RunExecConfig)
    assert executable.cmd == "video_prediction_train --model=DNA --num_masks=1"
def test_simple_generator_file_passes(self):
    """Check the values parsed from the `simple_generator_file.yml` fixture."""
    fixture_path = os.path.abspath('tests/fixtures/simple_generator_file.yml')
    plxfile = PolyaxonFile(fixture_path)
    spec = plxfile.experiment_spec_at(0)

    # File-level values.
    assert plxfile.matrix is None
    assert plxfile.version == 1
    assert plxfile.project.name == 'project1'
    assert plxfile.project_path == '/tmp/plx_logs/project1'
    assert plxfile.settings is None
    assert plxfile.run_type == RunTypes.LOCAL

    # Values of the single experiment.
    assert spec.experiment_path == '/tmp/plx_logs/project1/0'
    assert spec.environment is None
    assert spec.is_runnable
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)
    expected_cluster = {
        TaskType.MASTER: ['127.0.0.1:10000'],
        TaskType.PS: [],
        TaskType.WORKER: [],
    }
    assert_equal_dict(spec.get_cluster().to_dict(), expected_cluster)

    # Generator model: encoder and decoder graphs plus a bridge.
    model = spec.model
    assert isinstance(model, GeneratorConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    assert isinstance(model.encoder, GraphConfig)
    assert isinstance(model.decoder, GraphConfig)
    assert isinstance(model.bridge, NoOpBridgeConfig)

    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval is None
def run_all(polyaxonfile):
    """Start one process per prepared experiment run and wait for them.

    The first run of each experiment uses the `train_and_evaluate` schedule,
    every other run uses `train`. Started processes are recorded in the
    shared `jobs` list.
    """
    parsed_file = PolyaxonFile.read(polyaxonfile)
    for experiment_idx in range(parsed_file.matrix_space):
        runs = prepare_all_experiment_runs(polyaxonfile, experiment_idx)
        for position, experiment_run in enumerate(runs):
            schedule = 'train_and_evaluate' if position == 0 else 'train'
            process = Process(target=getattr(experiment_run, schedule))
            process.start()
            jobs.append(process)

        # Wait for every process recorded so far.
        for job in jobs:
            job.join()
def check_polyaxonfile(file):
    """Validate the given polyaxonfile(s) and return the parsed file.

    Args:
        file: a path or a collection of paths, normalized with `to_list`.

    Returns:
        The object returned by `PolyaxonFile.read`.

    Exits with status 1 when none of the paths is an existing file, or when
    reading the file(s) raises; in the latter case the error message is
    printed so the user can see why the file was rejected.
    """
    file = to_list(file)
    if not any(os.path.isfile(f) for f in file):
        Printer.print_error('Polyaxonfile is not present, '
                            'please run {}'.format(constants.INIT_COMMAND))
        sys.exit(1)

    try:
        plx_file = PolyaxonFile.read(file)
        Printer.print_success("Polyaxonfile valid")
        return plx_file
    except Exception as e:
        # Command boundary: report any failure instead of dumping a traceback.
        Printer.print_error("Polyaxonfile is not valid")
        Printer.print_error('Error message `{}`.'.format(e))
        sys.exit(1)
def prepare_experiment_run(polyaxonfile, experiment_id, task_type=TaskType.MASTER, task_id=0):
    """Create the `Experiment` for one task of the given experiment.

    Raises:
        ValueError: when `task_type` is not part of the cluster definition,
            its entry is not an int, or `task_id` is not below that count.
    """
    plx_file = PolyaxonFile.read(polyaxonfile)
    cluster, _ = plx_file.get_cluster_def_at(experiment_id)

    task_is_supported = (
        task_type in cluster and
        isinstance(cluster[task_type], int) and
        task_id < cluster[task_type])
    if not task_is_supported:
        raise ValueError('task_type, task_id `{}, {}` is not supported by '
                         'the specification file passed.'.format(task_type, task_id))

    env = plx_file.get_environment_at(experiment_id)
    if env:
        # An environment is declared: use its logging level and run configs.
        tf.logging.set_verbosity(LOGGING_LEVEL[plx_file.settings.logging.level])
        configs, _ = _get_run_configs(plx_file, experiment_id)
        delay_workers_by_global_step = env.delay_workers_by_global_step
    else:
        # No environment: a single master with a default run config.
        tf.logging.set_verbosity(tf.logging.INFO)
        configs = {TaskType.MASTER: [RunConfig()]}
        delay_workers_by_global_step = False

    train_input_fn, train_steps, train_hooks = _get_train(
        plx_file.get_train_at(experiment_id))
    eval_parts = _get_eval(plx_file.get_eval_at(experiment_id))
    (eval_input_fn, eval_steps, eval_hooks,
     eval_delay_secs, continuous_eval_throttle_secs) = eval_parts

    estimator = getters.get_estimator(
        plx_file.get_model_at(experiment_id),
        configs[task_type][task_id],
        output_dir=plx_file.get_project_path_at(experiment_id))

    return Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        train_hooks=train_hooks,
        eval_hooks=eval_hooks,
        eval_delay_secs=eval_delay_secs,
        continuous_eval_throttle_secs=continuous_eval_throttle_secs,
        delay_workers_by_global_step=delay_workers_by_global_step,
        export_strategies=plx_file.settings.export_strategies)
def check(file, version, cluster, run_type):
    """Command for checking a polyaxonfile."""
    plx_file = PolyaxonFile(file)
    logger.info("Polyaxonfile valid")

    # Only the first requested detail is logged, in this order of priority.
    if version:
        logger.info('The version is: {}'.format(plx_file.version))
        return
    if cluster:
        cluster_def, _ = plx_file.cluster_def
        logger.info('The cluster definition is: {}'.format(cluster_def))
        return
    if run_type:
        logger.info('The run_type is: {}'.format(plx_file.run_type))
        return

    # Nothing specific requested: log the whole parsed file.
    logger.info('Validated file:\n{}'.format(plx_file.parsed_data))
def check_polyaxonfile(file, log=True):  # pylint:disable=redefined-builtin
    """Validate the given polyaxonfile(s) and return the parsed file.

    Exits with status 1 when none of the paths is an existing file or when
    building the `PolyaxonFile` raises. The success message is printed only
    when `log` is truthy.
    """
    file = to_list(file)
    found = [os.path.isfile(path) for path in file]
    if True not in found:
        Printer.print_error('Polyaxonfile is not present, '
                            'please run {}'.format(constants.INIT_COMMAND))
        sys.exit(1)

    try:
        parsed_file = PolyaxonFile(file)
        if log:
            Printer.print_success("Polyaxonfile valid")
        return parsed_file
    except Exception as error:
        # Command boundary: report any failure and stop.
        Printer.print_error("Polyaxonfile is not valid ")
        Printer.print_error('Error message `{}`.'.format(error))
        sys.exit(1)
def test_simple_generator_file_passes(self):
    """Check the `simple_generator_file.yml` fixture through the specification API."""
    fixture_path = os.path.abspath('tests/fixtures/simple_generator_file.yml')
    spec = PolyaxonFile(fixture_path).specification

    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert spec.settings is None
    assert spec.environment is None
    assert spec.framework is None
    assert spec.is_runnable
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)

    # Generator model: encoder and decoder graphs plus a bridge.
    model = spec.model
    assert isinstance(model, GeneratorConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    assert isinstance(model.encoder, GraphConfig)
    assert isinstance(model.decoder, GraphConfig)
    assert isinstance(model.bridge, NoOpBridgeConfig)

    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval is None
def run(file):
    """Command for running a polyaxonfile."""
    plx_file = PolyaxonFile(file)
    # Only local runs are handled by this command.
    if plx_file.run_type != RunTypes.LOCAL:
        return

    # Check that polyaxon is installed.
    installed_version = get_version(PROJECT_NAME)
    if installed_version is None:
        click.echo("""In order to run locally, polyaxon must be installed.""")
        if not click.confirm("Do you want to install polyaxon now?"):
            click.echo("""Your can manually run: pip install -U polyaxon to install to the latest version of polyaxon)""")
            sys.exit(0)
        from polyaxon_cli.cli.version import pip_upgrade
        pip_upgrade(PROJECT_NAME)

    logger.info('Running polyaxonfile locally')
    # Imported late: polyaxon may only have been installed just above.
    from polyaxon.polyaxonfile.local_runner import run
    run(file)
def test_simple_file_passes(self):
    """Check the `simple_file.yml` fixture through the specification API."""
    fixture_path = os.path.abspath('tests/fixtures/simple_file.yml')
    spec = PolyaxonFile(fixture_path).specification

    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert spec.settings is None
    assert spec.environment is None
    assert spec.framework is None
    assert spec.is_runnable
    assert spec.cluster_def == ({TaskType.MASTER: 1}, False)

    # Model section.
    model = spec.model
    assert isinstance(model, RegressorConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 4
    assert graph.input_layers == [['images', 0, 0]]
    # The graph output must point at the last declared layer.
    assert graph.output_layers == [[graph.layers[-1].name, 0, 0]]

    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval is None
def run(polyaxonfile):
    """Run every experiment of the polyaxonfile, locally or as a process cluster.

    A non-distributed experiment is run in-process with the
    `continuous_train_and_eval` schedule. A distributed one gets a process
    for the master, one per worker and one per ps, and the recorded `jobs`
    are joined. In both cases the loop then waits until the shared
    `current_run` state is marked finished, and resets it.
    """
    parsed_file = PolyaxonFile.read(polyaxonfile)
    for experiment_idx in range(parsed_file.matrix_space):
        logging.info("running Experiment n: {}".format(experiment_idx))
        cluster, is_distributed = parsed_file.get_cluster_def_at(experiment_idx)

        if is_distributed:
            # The same dict is updated and handed to every created process.
            env = {
                'polyaxonfile': polyaxonfile,
                'task_type': TaskType.MASTER,
                'task_id': 0,
                'schedule': 'train_and_evaluate'
            }
            create_process(env)

            for worker_idx in xrange(cluster.get(TaskType.WORKER, 0)):
                env.update(task_id=worker_idx, task_type=TaskType.WORKER, schedule='train')
                create_process(env)

            for ps_idx in xrange(cluster.get(TaskType.PS, 0)):
                env.update(task_id=ps_idx, task_type=TaskType.PS, schedule='run_std_server')
                create_process(env)

            for job in jobs:
                job.join()
        else:
            start_experiment_run(
                parsed_file, experiment_idx, TaskType.MASTER, 0, 'continuous_train_and_eval')
            current_run['finished'] = True

        # Poll the shared state until the current experiment is done.
        while not current_run['finished']:
            time.sleep(30)

        current_run['finished'] = False
def test_advanced_file_passes(self):
    """Check the `advanced_file.yml` fixture: environment, cluster, model and pipelines."""
    fixture_path = os.path.abspath('tests/fixtures/advanced_file.yml')
    plxfile = PolyaxonFile(fixture_path)

    # File-level values.
    assert plxfile.version == 1
    assert plxfile.project.name == 'project1'
    assert plxfile.project_path == '/mypath/project1'
    assert plxfile.matrix is None
    assert plxfile.run_type == RunTypes.MINIKUBE
    assert isinstance(plxfile.settings, SettingsConfig)
    assert isinstance(plxfile.settings.logging, LoggingConfig)

    spec = plxfile.experiment_spec_at(0)
    assert spec.is_runnable

    # Environment section.
    environment = spec.environment
    assert isinstance(environment, EnvironmentConfig)
    assert environment.n_workers == 5
    assert environment.n_ps == 10
    assert environment.delay_workers_by_global_step is True
    run_config = environment.run_config
    assert isinstance(run_config, RunConfig)
    assert run_config.tf_random_seed == 100
    assert run_config.save_summary_steps == 100
    assert run_config.save_checkpoints_secs == 60
    session = run_config.session
    assert isinstance(session, SessionConfig)
    assert session.allow_soft_placement is True
    assert session.intra_op_parallelism_threads == 2
    assert session.inter_op_parallelism_threads == 2

    # Check properties for returning worker configs and resources.
    assert environment.worker_configs is None
    assert environment.ps_configs is None
    assert environment.resources is None
    assert environment.worker_resources is None
    assert environment.ps_resources is None

    assert spec.worker_configs == {}
    assert spec.ps_configs == {}
    assert spec.worker_resources == {}
    assert spec.ps_resources == {}

    expected_cluster_def = {TaskType.MASTER: 1, TaskType.WORKER: 5, TaskType.PS: 10}
    assert spec.cluster_def == (expected_cluster_def, True)

    def task_name(task_type, task_idx):
        return constants.TASK_NAME.format(project=plxfile.project.name,
                                          experiment=0,
                                          task_type=task_type,
                                          task_idx=task_idx)

    def task_addresses(task_type, count):
        # One `<task name>:2222` address per task index.
        return ['{}:2222'.format(task_name(task_type, idx)) for idx in range(count)]

    expected_cluster = {
        TaskType.MASTER: task_addresses(TaskType.MASTER, 1),
        TaskType.WORKER: task_addresses(TaskType.WORKER, 5),
        TaskType.PS: task_addresses(TaskType.PS, 10),
    }
    assert_equal_dict(spec.get_cluster().to_dict(), expected_cluster)

    # Model section.
    model = spec.model
    assert isinstance(model, ClassifierConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    assert model.optimizer.learning_rate == 0.21
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 7
    assert graph.input_layers == [['images', 0, 0]]
    assert len(graph.output_layers) == 3
    assert ['super_dense', 0, 0] in graph.output_layers

    # Train and eval pipelines.
    train_pipeline = spec.train.data_pipeline
    assert isinstance(train_pipeline, TFRecordImagePipelineConfig)
    assert len(train_pipeline.feature_processors.feature_processors) == 1
    eval_pipeline = spec.eval.data_pipeline
    assert isinstance(eval_pipeline, TFRecordImagePipelineConfig)
    assert eval_pipeline.feature_processors is None
def test_advanced_file_with_custom_configs_and_resources_passes(self):
    """Check the fixture declaring per-task session configs and pod resources."""
    fixture_path = os.path.abspath(
        'tests/fixtures/advanced_file_with_custom_configs_and_resources.yml')
    spec = PolyaxonFile(fixture_path).specification

    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.TENSORFLOW

    # Tensorflow section of the environment.
    tf_env = spec.environment.tensorflow
    assert tf_env.n_workers == 5
    assert tf_env.n_ps == 10
    assert tf_env.delay_workers_by_global_step is True
    run_config = tf_env.run_config
    assert isinstance(run_config, RunConfig)
    assert run_config.tf_random_seed == 100
    assert run_config.save_summary_steps == 100
    assert run_config.save_checkpoints_secs == 60

    # Resources declared at the environment level.
    resources = spec.environment.resources
    assert isinstance(resources, PodResourcesConfig)
    assert isinstance(resources.cpu, K8SResourcesConfig)
    assert resources.cpu.requests == 1
    assert resources.cpu.limits == 2

    session = run_config.session
    assert isinstance(session, SessionConfig)
    assert session.allow_soft_placement is True
    assert session.intra_op_parallelism_threads == 2
    assert session.inter_op_parallelism_threads == 2

    default_worker_config = tf_env.default_worker_config
    assert isinstance(default_worker_config, SessionConfig)
    assert default_worker_config.allow_soft_placement is True
    assert default_worker_config.intra_op_parallelism_threads == 1
    assert default_worker_config.inter_op_parallelism_threads == 1

    custom_worker_config = tf_env.worker_configs[0]
    assert isinstance(custom_worker_config, SessionConfig)
    assert custom_worker_config.index == 3
    assert custom_worker_config.allow_soft_placement is False
    assert custom_worker_config.intra_op_parallelism_threads == 5
    assert custom_worker_config.inter_op_parallelism_threads == 5

    assert tf_env.ps_configs is None
    assert tf_env.worker_resources is None

    default_ps_resources = tf_env.default_ps_resources
    assert isinstance(default_ps_resources, PodResourcesConfig)
    assert isinstance(default_ps_resources.cpu, K8SResourcesConfig)
    assert default_ps_resources.cpu.requests == 2
    assert default_ps_resources.cpu.limits == 4

    custom_ps_resources = tf_env.ps_resources[0]
    assert isinstance(custom_ps_resources, PodResourcesConfig)
    assert isinstance(custom_ps_resources.memory, K8SResourcesConfig)
    assert custom_ps_resources.index == 9
    assert custom_ps_resources.memory.requests == 512
    assert custom_ps_resources.memory.limits == 1024

    # Check that the helpers returning configs and resources per task work.
    cluster, is_distributed = spec.cluster_def
    lookup_kwargs = dict(environment=spec.environment,
                         cluster=cluster,
                         is_distributed=is_distributed)

    worker_configs = TensorflowSpecification.get_worker_configs(**lookup_kwargs)
    assert len(worker_configs) == tf_env.n_workers
    assert set(worker_configs.values()) == {default_worker_config, custom_worker_config}

    assert TensorflowSpecification.get_ps_configs(**lookup_kwargs) == {}
    assert TensorflowSpecification.get_worker_resources(**lookup_kwargs) == {}

    ps_resources = TensorflowSpecification.get_ps_resources(**lookup_kwargs)
    assert len(ps_resources) == tf_env.n_ps
    assert set(ps_resources.values()) == {default_ps_resources, custom_ps_resources}

    # Check total resources.
    expected_total = {
        'cpu': {'requests': 1 + 2 * 9, 'limits': 2 + 4 * 9},
        'memory': {'requests': 512, 'limits': 1024},
        'gpu': None
    }
    assert spec.total_resources == expected_total

    expected_cluster_def = {TaskType.MASTER: 1, TaskType.WORKER: 5, TaskType.PS: 10}
    assert spec.cluster_def == (expected_cluster_def, True)

    # Model section.
    model = spec.model
    assert isinstance(model, ClassifierConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    assert model.optimizer.learning_rate == 0.21
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 7
    assert graph.input_layers == [['images', 0, 0]]
    assert len(graph.output_layers) == 3
    assert ['super_dense', 0, 0] in graph.output_layers

    # Train and eval pipelines.
    train_pipeline = spec.train.data_pipeline
    assert isinstance(train_pipeline, TFRecordImagePipelineConfig)
    assert len(train_pipeline.feature_processors.feature_processors) == 1
    eval_pipeline = spec.eval.data_pipeline
    assert isinstance(eval_pipeline, TFRecordImagePipelineConfig)
    assert eval_pipeline.feature_processors is None
def test_missing_project_raises(self):
    """Loading the ``missing_project.yml`` fixture must raise PolyaxonfileError."""
    fixture_path = os.path.abspath('tests/fixtures/missing_project.yml')
    with self.assertRaises(PolyaxonfileError):
        PolyaxonFile(fixture_path)
def test_distributed_mxnet_passes(self):
    """Check the specification parsed from ``distributed_mxnet_file.yml``.

    Verifies the top-level metadata, the MXNet worker/ps counts, the
    default and per-index resource definitions, the per-task resource
    maps built by ``MXNetSpecification``, the aggregated total resources,
    and the resulting cluster definition.
    """
    fixture_path = os.path.abspath('tests/fixtures/distributed_mxnet_file.yml')
    polyaxonfile = PolyaxonFile(fixture_path)
    spec = polyaxonfile.specification

    # Top-level metadata and settings.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    settings = spec.settings
    assert isinstance(settings, SettingsConfig)
    assert isinstance(settings.logging, LoggingConfig)
    assert settings.matrix is None

    # Environment and framework.
    env = spec.environment
    assert isinstance(env, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.MXNET
    mxnet = env.mxnet
    assert mxnet.n_workers == 5
    assert mxnet.n_ps == 10

    # Resources declared directly on the environment.
    env_resources = env.resources
    assert isinstance(env_resources, PodResourcesConfig)
    assert isinstance(env_resources.cpu, K8SResourcesConfig)
    assert env_resources.cpu.requests == 1
    assert env_resources.cpu.limits == 2

    # Default worker resources.
    default_worker = mxnet.default_worker_resources
    assert isinstance(default_worker, PodResourcesConfig)
    assert isinstance(default_worker.cpu, K8SResourcesConfig)
    assert default_worker.cpu.requests == 3
    assert default_worker.cpu.limits == 3
    assert isinstance(default_worker.memory, K8SResourcesConfig)
    assert default_worker.memory.requests == 256
    assert default_worker.memory.limits == 256

    # First per-index worker resources entry.
    worker_override = mxnet.worker_resources[0]
    assert isinstance(worker_override, PodResourcesConfig)
    assert isinstance(worker_override.memory, K8SResourcesConfig)
    assert worker_override.index == 3
    assert worker_override.memory.requests == 300
    assert worker_override.memory.limits == 300

    # Default parameter-server resources.
    default_ps = mxnet.default_ps_resources
    assert isinstance(default_ps, PodResourcesConfig)
    assert isinstance(default_ps.cpu, K8SResourcesConfig)
    assert default_ps.cpu.requests == 2
    assert default_ps.cpu.limits == 4

    # First per-index parameter-server resources entry.
    ps_override = mxnet.ps_resources[0]
    assert isinstance(ps_override, PodResourcesConfig)
    assert isinstance(ps_override.memory, K8SResourcesConfig)
    assert ps_override.index == 9
    assert ps_override.memory.requests == 512
    assert ps_override.memory.limits == 1024

    # Check that the helpers returning per-task resources work.
    cluster, is_distributed = spec.cluster_def
    task_kwargs = dict(environment=env,
                       cluster=cluster,
                       is_distributed=is_distributed)

    worker_resources = MXNetSpecification.get_worker_resources(**task_kwargs)
    assert len(worker_resources) == mxnet.n_workers
    assert set(worker_resources.values()) == {default_worker, worker_override}

    ps_resources = MXNetSpecification.get_ps_resources(**task_kwargs)
    assert len(ps_resources) == mxnet.n_ps
    assert set(ps_resources.values()) == {default_ps, ps_override}

    # Check total resources.
    # NOTE(review): the 4 and 9 multipliers presumably count the tasks that
    # fall back to the default worker/ps resources -- confirm against fixture.
    expected_total = {
        'cpu': {
            'requests': 1 + 3 * 4 + 2 * 9,
            'limits': 2 + 3 * 4 + 4 * 9,
        },
        'memory': {
            'requests': 300 + 256 * 4 + 512,
            'limits': 300 + 256 * 4 + 1024,
        },
        'gpu': None,
    }
    assert spec.total_resources == expected_total

    expected_cluster = {
        TaskType.MASTER: 1,
        TaskType.WORKER: 5,
        TaskType.SERVER: 10,
    }
    assert spec.cluster_def == (expected_cluster, True)
def test_wrong_project_name_raises(self):
    """Loading the ``wrong_project_name.yml`` fixture must raise PolyaxonfileError."""
    fixture_path = os.path.abspath('tests/fixtures/wrong_project_name.yml')
    with self.assertRaises(PolyaxonfileError):
        PolyaxonFile(fixture_path)
def test_advanced_file_passes(self):
    """Check the specification parsed from ``advanced_file.yml``.

    Verifies the top-level metadata, the TensorFlow environment (task
    counts, run config and session config), that no per-task configs or
    resources are defined, the cluster definition, and the model / train /
    eval sections.
    """
    # NOTE(review): a second ``test_advanced_file_passes`` is defined later
    # in this file; if both live in the same class the later definition
    # replaces this one and these assertions never run -- confirm.
    fixture_path = os.path.abspath('tests/fixtures/advanced_file.yml')
    polyaxonfile = PolyaxonFile(fixture_path)
    spec = polyaxonfile.specification

    # Top-level metadata and settings.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    settings = spec.settings
    assert isinstance(settings, SettingsConfig)
    assert isinstance(settings.logging, LoggingConfig)
    assert settings.matrix is None
    assert spec.is_runnable

    # Environment and framework.
    env = spec.environment
    assert isinstance(env, EnvironmentConfig)
    assert spec.framework == Frameworks.TENSORFLOW
    tensorflow = env.tensorflow
    assert tensorflow.n_workers == 5
    assert tensorflow.n_ps == 10
    assert tensorflow.delay_workers_by_global_step is True

    # Run config and its session.
    run_config = tensorflow.run_config
    assert isinstance(run_config, RunConfig)
    assert run_config.tf_random_seed == 100
    assert run_config.save_summary_steps == 100
    assert run_config.save_checkpoints_secs == 60
    session = run_config.session
    assert isinstance(session, SessionConfig)
    assert session.allow_soft_placement is True
    assert session.intra_op_parallelism_threads == 2
    assert session.inter_op_parallelism_threads == 2

    # Check properties for returning worker configs and resources.
    assert tensorflow.worker_configs is None
    assert tensorflow.ps_configs is None
    assert tensorflow.worker_resources is None
    assert tensorflow.ps_resources is None

    # With nothing declared per task, every helper must return an empty dict.
    cluster, is_distributed = spec.cluster_def
    per_task_getters = (
        TensorflowSpecification.get_worker_configs,
        TensorflowSpecification.get_ps_configs,
        TensorflowSpecification.get_worker_resources,
        TensorflowSpecification.get_ps_resources,
    )
    for getter in per_task_getters:
        assert getter(environment=env,
                      cluster=cluster,
                      is_distributed=is_distributed) == {}

    expected_cluster = {
        TaskType.MASTER: 1,
        TaskType.WORKER: 5,
        TaskType.PS: 10,
    }
    assert spec.cluster_def == (expected_cluster, True)

    # Model section.
    model = spec.model
    assert isinstance(model, ClassifierConfig)
    assert isinstance(model.loss, MeanSquaredErrorConfig)
    assert isinstance(model.optimizer, AdamConfig)
    assert model.optimizer.learning_rate == 0.21
    graph = model.graph
    assert isinstance(graph, GraphConfig)
    assert len(graph.layers) == 7
    assert graph.input_layers == [['images', 0, 0]]
    assert len(graph.output_layers) == 3
    assert ['super_dense', 0, 0] in graph.output_layers

    # Train and eval data pipelines.
    train_pipeline = spec.train.data_pipeline
    assert isinstance(train_pipeline, TFRecordImagePipelineConfig)
    assert len(train_pipeline.feature_processors.feature_processors) == 1
    eval_pipeline = spec.eval.data_pipeline
    assert isinstance(eval_pipeline, TFRecordImagePipelineConfig)
    assert eval_pipeline.feature_processors is None
def test_advanced_file_passes(self):
    """Load the ``advanced_file.yml`` fixture through ``PolyaxonFile``."""
    # NOTE(review): another ``test_advanced_file_passes`` is defined earlier
    # in this file; if both live in the same class this definition replaces
    # the earlier one -- confirm which of the two is intended to run.
    fixture_path = os.path.abspath('tests/fixtures/advanced_file.yml')
    PolyaxonFile(fixture_path)