def test_restart_on_sudden_instance_termination(training_finished,
                                                launch_train,
                                                spot_terminated,
                                                caplog):
    """Check that a worker goes to 'retry' when the instance is unreachable.

    The submission is first launched with the training reported as not
    finished, which must leave the worker 'running' without any log
    output. The "training finished" check is then made to raise
    ``subprocess.CalledProcessError`` and the worker is expected to
    report the 'retry' status and to log that the submission is put back
    into the queue.

    NOTE(review): ``training_finished``, ``launch_train`` and
    ``spot_terminated`` are presumably mocks injected by patch decorators
    that are not part of this block -- confirm against the decorators.
    """
    # stand-in for an AWS instance; only the ``id`` attribute is provided
    class DummyInstance:
        id = 1

    launch_train.return_value = 0

    # build the AWS worker from the configuration template
    worker_config = read_config(ramp_aws_config_template())['worker']
    aws_worker = AWSWorker(worker_config, submission='starting_kit_local')
    aws_worker.config = worker_config
    aws_worker.submission = 'dummy submissions'
    aws_worker.instance = DummyInstance

    # the training is still in progress and the instance is still alive
    training_finished.return_value = False
    spot_terminated.return_value = False

    aws_worker.launch_submission()
    assert aws_worker.status == 'running'
    assert caplog.text == ''

    # from now on, checking whether the training finished raises
    training_finished.side_effect = subprocess.CalledProcessError(
        255, 'test'
    )

    # the worker must ask for a retry and say so in the logs
    assert aws_worker.status == 'retry'
    assert 'Unable to connect to the instance' in caplog.text
    assert 'Adding the submission back to the queue' in caplog.text
def test_aws_worker():
    """Run a full AWS worker life cycle on the iris kit.

    The test is skipped unless a ``config.yml`` file exists under
    ``HERE``. Otherwise it makes sure the ``predictions`` and ``logs``
    directories of the kit exist and contain no sub-directory left over
    from a previous run, then drives the worker through setup, launch,
    result collection and teardown, asserting the status after each step
    and the presence of the collected prediction directory and log file.
    """
    config_path = os.path.join(HERE, 'config.yml')
    if not os.path.isfile(config_path):
        pytest.skip("Only for local tests for now")

    ramp_kit_dir = os.path.join(HERE, 'kits', 'iris')
    output_dirs = [
        os.path.join(ramp_kit_dir, name) for name in ('predictions', 'logs')
    ]

    # make sure prediction and log dirs exist, if not, add them
    for output_dir in output_dirs:
        add_empty_dir(output_dir)

    # if the prediction / log sub-directories are still there, remove them.
    # os.listdir() yields bare entry names, so each one has to be joined
    # with its parent directory; otherwise it would be resolved against
    # the current working directory and nothing would ever be removed.
    for output_dir in output_dirs:
        for entry in os.listdir(output_dir):
            entry_path = os.path.join(output_dir, entry)
            if os.path.isdir(entry_path):
                shutil.rmtree(entry_path)

    config = read_config(config_path)
    worker_config = generate_worker_config(config)
    worker = AWSWorker(worker_config, submission='starting_kit_local')

    worker.setup()
    assert worker.status == 'setup'

    worker.launch_submission()
    # the submission may already be done by the time the status is read
    assert worker.status in ('running', 'finished')

    worker.collect_results()
    assert worker.status == 'collected'
    assert os.path.isdir(
        os.path.join(ramp_kit_dir, 'predictions', 'starting_kit_local',
                     'fold_0'))
    assert os.path.isfile(
        os.path.join(ramp_kit_dir, 'logs', 'starting_kit_local', 'log'))

    worker.teardown()
    assert worker.status == 'killed'
def test_aws_worker_launch_train_error(launch_train, caplog):
    """Check the worker behaviour when launching the training fails.

    ``launch_train`` is configured to raise
    ``subprocess.CalledProcessError``. ``launch_submission`` is then
    expected to return 1, to leave the worker in the 'error' status and
    to log the failure.

    NOTE(review): ``launch_train`` is presumably a mock injected by a
    patch decorator that is not part of this block -- confirm.
    """
    # mock dummy AWS instance; only the ``id`` attribute is provided
    class DummyInstance:
        id = 1

    launch_train.side_effect = subprocess.CalledProcessError(255, 'test')

    # build the AWS worker from the configuration template
    worker_config = read_config(ramp_aws_config_template())['worker']
    aws_worker = AWSWorker(worker_config, submission='starting_kit_local')
    aws_worker.config = worker_config
    aws_worker.submission = 'dummy submissions'
    aws_worker.instance = DummyInstance

    # CalledProcessError is thrown inside
    exit_status = aws_worker.launch_submission()
    assert exit_status == 1

    assert 'test' in caplog.text
    assert 'Cannot start training of submission' in caplog.text
    assert aws_worker.status == 'error'