def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup, cpup, command): """ Restart a given job as a new job. """ parameters = {} expt_client = ExperimentClient() try: job = expt_client.get(normalize_job_name(job_name)) except FloydException: job = expt_client.get(job_name) if gpup: instance_type = G1P_INSTANCE_TYPE elif cpup: instance_type = C1P_INSTANCE_TYPE elif gpu: instance_type = G1_INSTANCE_TYPE elif cpu: instance_type = C1_INSTANCE_TYPE else: instance_type = job.instance_type if instance_type is not None: parameters['instance_type'] = instance_type else: instance_type = job.instance_type if env is not None: if not validate_env(env, instance_type): sys.exit(1) parameters['env'] = env success, data_ids = process_data_ids(data) if not success: sys.exit(1) if message: parameters['message'] = message if command: parameters['command'] = ' '.join(command) floyd_logger.info('Restarting job %s...', job_name) new_job_info = expt_client.restart(job.id, parameters=parameters) if not new_job_info: floyd_logger.error("Failed to restart job") sys.exit(1) floyd_logger.info('New job created:') table_output = [["JOB NAME"], [new_job_info['name']]] floyd_logger.info('\n' + tabulate(table_output, headers="firstrow") + '\n') show_new_job_info(expt_client, new_job_info['name'], new_job_info, job.mode, open_notebook)
def run(ctx, gpu, env, message, data, mode, open, tensorboard, command): """ Run a command on Floyd. Floyd will upload contents of the current directory and run your command remotely. This command will generate a run id for reference. """ experiment_config = ExperimentConfigManager.get_config() if not ProjectClient().exists(experiment_config.family_id): floyd_logger.error( 'Invalid project id, please run ' '"floyd init PROJECT_NAME" before scheduling a job.') return access_token = AuthConfigManager.get_access_token() experiment_name = "{}/{}".format(access_token.username, experiment_config.name) # Create module if len(data) > 5: floyd_logger.error("Cannot attach more than 5 datasets to an job") return # Get the data entity from the server to: # 1. Confirm that the data id or uri exists and has the right permissions # 2. If uri is used, get the id of the dataset data_ids = [] for data_name_or_id in data: path = None if ':' in data_name_or_id: data_name_or_id, path = data_name_or_id.split(':') data_obj = DataClient().get(data_name_or_id) if not data_obj: floyd_logger.error( "Data not found for name or id: {}".format(data_name_or_id)) return data_ids.append( "{}:{}".format(data_obj.id, path) if path else data_obj.id) default_name = 'input' if len(data_ids) <= 1 else None module_inputs = [{ 'name': get_data_name(data_str, default_name), 'type': 'dir' } for data_str in data_ids] if gpu: arch = 'gpu' instance_type = GPU_INSTANCE_TYPE else: arch = 'cpu' instance_type = CPU_INSTANCE_TYPE env_map = EnvClient().get_all() envs = env_map.get(arch) if envs: if env not in envs: floyd_logger.error( "{} is not in the list of supported environments: {}".format( env, ', '.join(envs.keys()))) return else: floyd_logger.error("{} is not a supported architecture".format(arch)) return command_str = ' '.join(command) module = Module(name=experiment_name, description=message or '', command=command_str, mode=get_mode_parameter(mode), enable_tensorboard=tensorboard, family_id=experiment_config.family_id, inputs=module_inputs, env=env, arch=arch) from floyd.exceptions import BadRequestException try: module_id = ModuleClient().create(module) except BadRequestException as e: if 'Project not found, ID' in e.message: floyd_logger.error( 'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.' ) else: floyd_logger.error('ERROR: %s', e.message) sys.exit(1) floyd_logger.debug("Created module with id : {}".format(module_id)) # Create experiment request # Get the actual command entered in the command line full_command = get_command_line(gpu, env, message, data, mode, open, tensorboard, command) experiment_request = ExperimentRequest( name=experiment_name, description=message, full_command=full_command, module_id=module_id, data_ids=data_ids, family_id=experiment_config.family_id, instance_type=instance_type) expt_cli = ExperimentClient() expt_info = expt_cli.create(experiment_request) floyd_logger.debug("Created job : {}".format(expt_info['id'])) table_output = [["JOB NAME"], [expt_info['name']]] floyd_logger.info(tabulate(table_output, headers="firstrow")) floyd_logger.info("") if mode in ['jupyter', 'serve']: while True: # Wait for the experiment / task instances to become available try: experiment = expt_cli.get(expt_info['id']) if experiment.task_instances: break except Exception: floyd_logger.debug("Job not available yet: {}".format( expt_info['id'])) floyd_logger.debug("Job not available yet: {}".format( expt_info['id'])) sleep(3) continue # Print the path to jupyter notebook if mode == 'jupyter': jupyter_url = experiment.service_url print( "Setting up your instance and waiting for Jupyter notebook to become available ...", end='') if wait_for_url(jupyter_url, sleep_duration_seconds=2, iterations=900): floyd_logger.info( "\nPath to jupyter notebook: {}".format(jupyter_url)) if open: webbrowser.open(jupyter_url) else: floyd_logger.info( "\nPath to jupyter notebook: {}".format(jupyter_url)) floyd_logger.info( "Notebook is still loading. View logs to track progress") floyd_logger.info(" floyd logs {}".format(expt_info['name'])) # Print the path to serving endpoint if mode == 'serve': floyd_logger.info("Path to service endpoint: {}".format( experiment.service_url)) if experiment.timeout_seconds < 4 * 60 * 60: floyd_logger.info( "\nYour job timeout is currently set to {} seconds".format( experiment.timeout_seconds)) floyd_logger.info( "This is because you are in a trial account. Paid users will have longer timeouts. " "See https://www.floydhub.com/pricing for details") else: floyd_logger.info("To view logs enter:") floyd_logger.info(" floyd logs {}".format(expt_info['name']))
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup, cpup, command): """ Restart a given job as a new job. """ # Error early if more than one --env is passed. Then get the first/only # --env out of the list so all other operations work normally (they don't # expect an iterable). For details on this approach, see the comment above # the --env click option if len(env) > 1: floyd_logger.error( "You passed more than one environment: {}. Please specify a single environment." .format(env)) sys.exit(1) env = env[0] parameters = {} expt_client = ExperimentClient() try: job = expt_client.get(normalize_job_name(job_name)) except FloydException: job = expt_client.get(job_name) if gpup: instance_type = G1P_INSTANCE_TYPE elif cpup: instance_type = C1P_INSTANCE_TYPE elif gpu: instance_type = G1_INSTANCE_TYPE elif cpu: instance_type = C1_INSTANCE_TYPE else: instance_type = job.instance_type if instance_type is not None: parameters['instance_type'] = instance_type else: instance_type = job.instance_type if env is not None: arch = INSTANCE_ARCH_MAP[instance_type] if not validate_env(env, arch): sys.exit(1) parameters['env'] = env success, data_ids = process_data_ids(data) if not success: sys.exit(1) if data_ids: parameters['data_ids'] = data_ids if message: parameters['description'] = message if command: parameters['command'] = ' '.join(command) floyd_logger.info('Restarting job %s...', job_name) new_job_info = expt_client.restart(job.id, parameters=parameters) if not new_job_info: floyd_logger.error("Failed to restart job") sys.exit(1) show_new_job_info(expt_client, new_job_info['name'], new_job_info, job.mode, open_notebook)