def stream_job_log(job):
    """Stream the live log output of a single job.

    Builds the websocket stream URL for the job and hands off to
    LogPrinter, labelling output lines with the job's short id.
    """
    meta = {"job_id": job.short_id}
    if util.is_tensorboard_job(job):
        # Attach the job itself so the printer can surface tensorboard info.
        meta["tensorboard_job"] = job
    name_lookup = {job.id: job.short_id}
    stream_url = '%s/ws/jobs/%s/stream' % (get_stream_url(), job.id)
    LogPrinter(stream_url, name_lookup, stream_meta=meta).stream()
def stream_experiment_log(experiment, filter_job=None):
    """Stream logs for an experiment, its jobs, and all child experiments.

    Args:
        experiment: experiment object with ``id``, ``short_id``, ``jobs``,
            ``children`` and ``framework_config`` attributes.
        filter_job: optional job name passed through to the stream so the
            server/printer can filter output to a single job.
    """
    def add_experiment_to_log(exp):
        # Map experiment and job ids to short, human-readable labels.
        ids_to_name[exp.id] = exp.short_id
        for job in exp.jobs:
            ids_to_name[job.id] = '{}.{}'.format(exp.short_id, job.name)

    url = '%s/ws/experiments/%s/stream' % (get_stream_url(), experiment.id)
    ids_to_name = {}
    # Copy: the original `jobs = experiment.jobs` aliased the list, so the
    # `jobs += ...` below mutated experiment.jobs in place.
    jobs = list(experiment.jobs)
    add_experiment_to_log(experiment)
    for child_experiment in experiment.children:
        jobs += child_experiment.jobs
        add_experiment_to_log(child_experiment)
    # Horovod worker count: prefer the explicitly configured process count,
    # fall back to the master job's GPU count, then to 1.
    horovod_gpus = max(
        [job.gpus for job in jobs if job.role == 'tf-hrvd-master'] + [0])
    horovod_processes = experiment.framework_config or {}
    horovod_processes = horovod_processes.get('horovod', {}) or {}
    horovod_processes = horovod_processes.get('workers', {}) or {}
    horovod_processes = horovod_processes.get('processes', None)
    meta = {
        "experiment_id": experiment.short_id,
        "filter_job": filter_job,
        "horovod_processes": horovod_processes or horovod_gpus or 1
    }
    if util.has_tensorboard(experiment):
        meta["tensorboard_job"] = util.tensorboard_job(experiment)
    LogPrinter(url, ids_to_name, stream_meta=meta).stream()
def monitor_experiment(experiment, detailed=False, stream_meta=None):
    """Open a live monitoring stream for an experiment's jobs.

    Args:
        experiment: experiment object with ``id``, ``short_id`` and
            ``project`` attributes.
        detailed: forwarded to ``monitor_jobs`` for verbose output.
        stream_meta: optional extra metadata merged into the stream meta.
            (Was a mutable default ``{}`` and silently ignored before;
            it is now honored, with ``experiment_id`` always set.)
    """
    url = '%s/ws/experiments/%s/monitor' % (get_stream_url(), experiment.id)
    jobs = get_experiment_jobs(experiment)
    # Merge caller-supplied meta instead of discarding it; copy so the
    # caller's dict is never mutated.
    meta = dict(stream_meta or {})
    meta["experiment_id"] = experiment.short_id
    monitor_jobs(url, experiment.project, jobs, detailed=detailed,
                 stream_meta=meta)
def main():
    """Entry point for the RiseML CLI: build the parser and dispatch."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', help="show endpoints",
                        action='store_const', const=True)
    parser.add_argument('--version', '-V', help="show version",
                        action='version',
                        version='RiseML CLI {}'.format(VERSION))
    subparsers = parser.add_subparsers()

    # Subcommand registration order defines help output.
    registrars = [
        # user ops
        add_whoami_parser,
        add_user_parser,
        # system ops
        add_system_parser,
        add_account_parser,
        # data ops
        add_ls_parser,
        add_cp_parser,
        add_rm_parser,
        # workflow ops
        add_init_parser,
        add_train_parser,
        # add_exec_parser,
        add_monitor_parser,
        # add_deploy_parser,
        add_logs_parser,
        add_kill_parser,
        add_status_parser,
    ]
    for register in registrars:
        register(subparsers)

    args = parser.parse_args(sys.argv[1:])

    if args.v:
        for label, value in (('api_url', get_api_url()),
                             ('sync_url', get_sync_url()),
                             ('stream_url', get_stream_url()),
                             ('git_url', get_git_url())):
            print('%s: %s' % (label, value))

    if not hasattr(args, 'run'):
        # No subcommand was given.
        parser.print_usage()
        return

    # Every subcommand except login requires an existing config file.
    if not (config_file_exists() or args.run.__name__ == 'run_login'):
        handle_error('Client configuration file %s not found'
                     % get_config_file())
    try:
        args.run(args)
    except HTTPError as e:
        # all uncaught http errors goes here
        handle_error(str(e))
    except KeyboardInterrupt:
        print('\nAborting...')
def stream_experiment_log(experiment):
    """Stream logs for an experiment and all of its child experiments.

    Labels each log line with a short name derived from the owning
    experiment's short id and the job name.
    """
    def register(exp):
        # Map experiment and job ids to short display labels.
        ids_to_name[exp.id] = exp.short_id
        for job in exp.jobs:
            ids_to_name[job.id] = '{}.{}'.format(exp.short_id, job.name)

    ids_to_name = {}
    register(experiment)
    for child in experiment.children:
        register(child)

    meta = {"experiment_id": experiment.short_id}
    if util.has_tensorboard(experiment):
        meta["tensorboard_job"] = util.tensorboard_job(experiment)

    url = '%s/ws/experiments/%s/stream' % (get_stream_url(), experiment.id)
    LogPrinter(url, ids_to_name, stream_meta=meta).stream()
def monitor_job(job, detailed=False):
    """Open a live monitoring stream for a single job."""
    endpoint = '%s/ws/jobs/%s/monitor' % (get_stream_url(), job.id)
    meta = {"job_id": job.short_id}
    monitor_jobs(endpoint, job.project, [job], detailed=detailed,
                 stream_meta=meta)