def start(self):
    self._monitor.pull_job_info()
    self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                   self._redis_host,
                                   self._redis_port)
    self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                   self._redis_host,
                                   self._redis_port)

    logger.info(
        f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...')
    self._notify_start()

    while True:
        proposal = self._fetch_proposal()
        if proposal is not None:
            result = self._perform_trial(proposal)
            self._submit_result(result)
        time.sleep(LOOP_SLEEP_SECS)
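# A minimal sketch (an assumption, not the platform's actual implementation) of
# how worker-side helpers like _fetch_proposal() and _submit_result() above could
# sit on top of TrainCache, using only the cache calls visible in tune_model()
# below: the worker polls get_proposal() for its own worker ID, and reports back
# via create_result() followed by delete_proposal(). The _sketch names are
# illustrative.
def _fetch_proposal_sketch(train_cache: TrainCache, worker_id: str):
    # Returns the advisor's pending proposal for this worker, or None if none
    return train_cache.get_proposal(worker_id)

def _submit_result_sketch(train_cache: TrainCache, worker_id: str, result):
    # Publish the trial's result for the advisor to take, then clear the proposal
    train_cache.create_result(worker_id, result)
    train_cache.delete_proposal(worker_id)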
def start(self):
    self._monitor.pull_job_info()
    self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                   self._redis_host,
                                   self._redis_port)
    self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                   self._redis_host,
                                   self._redis_port)
    self._advisor = self._make_advisor()

    logger.info(
        f'Starting advisor for sub train job "{self._monitor.sub_train_job_id}"...')
    self._notify_start()

    while True:
        self._fetch_results()
        if not self._make_proposals():
            self._notify_budget_reached()
            break
        time.sleep(LOOP_SLEEP_SECS)
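# A hedged sketch of the advisor-side round trip that _fetch_results() and
# _make_proposals() above presumably perform, mirroring the single-process
# protocol spelled out step by step in tune_model() below. The _sketch name is
# an illustrative assumption; only the TrainCache and advisor calls shown in
# tune_model() are used.
def _advisor_round_sketch(advisor, train_cache: TrainCache, trial_no: int) -> bool:
    # Ingest any finished results as feedback to the advisor
    for worker_id in train_cache.get_workers():
        result = train_cache.take_result(worker_id)
        if result is not None:
            advisor.feedback(worker_id, result)

    # Propose to every free worker; report whether any proposal was made,
    # so the caller can stop once the advisor is exhausted
    proposed = False
    for worker_id in train_cache.get_workers():
        if train_cache.get_proposal(worker_id) is None:
            proposal = advisor.propose(worker_id, trial_no)
            if proposal is not None:
                train_cache.create_proposal(worker_id, proposal)
                proposed = True
    return proposed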
def start(self):
    self._monitor.pull_job_info()
    self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                   self._redis_host,
                                   self._redis_port)
    self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                   self._redis_host,
                                   self._redis_port)

    logger.info(
        f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...')
    self._notify_start()

    # For distributed training, skip the advisor and run a single fixed trial
    if os.environ.get("DIST_TRAIN_MODEL") == "DIST":
        logger.info('Starting distributed training...')
        proposal = Proposal(trial_no=1, knobs={})
        model_inst = self._load_model(proposal)
        self._train_model(model_inst, proposal, None)
        result = self._evaluate_model(model_inst, proposal)
        # In the master process's container, MASTER_ADDR is "localhost";
        # only the master saves the model
        if os.environ["MASTER_ADDR"] == "localhost":
            self._save_model(model_inst, proposal, result)
    else:
        # Train as usual, polling the advisor for proposals
        while True:
            logger.info('Fetching proposal...')
            proposal = self._fetch_proposal()
            if proposal is not None:
                result = self._perform_trial(proposal)
                self._submit_result(result)
            time.sleep(LOOP_SLEEP_SECS)
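# Hedged helper (an illustration, not part of the platform) that makes the
# environment contract of the distributed branch above explicit: distributed
# mode is selected by DIST_TRAIN_MODEL=DIST, and within it the master process's
# container sees MASTER_ADDR="localhost" and is the only process that saves
# the trained model.
def _is_dist_master() -> bool:
    return (os.environ.get("DIST_TRAIN_MODEL") == "DIST"
            and os.environ.get("MASTER_ADDR") == "localhost")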
def tune_model(py_model_class: Type[BaseModel],
               train_dataset_path: str,
               val_dataset_path: str,
               annotation_dataset_path: str = None,
               task: str = None,
               test_dataset_path: str = None,
               budget: Budget = None,
               train_args: Dict[str, Any] = None) -> Tuple[Proposal, float, Params]:
    worker_id = 'local'

    # Note start time
    start_time = time.time()

    # Retrieve config of model
    _print_header('Checking model configuration...')
    knob_config = py_model_class.get_knob_config()
    _check_knob_config(knob_config)

    # Read knob values from CLI args
    _print_header('Starting trials...')
    knobs_from_args = _maybe_read_knobs_from_args(knob_config)

    # Read budget options from CLI args
    budget_from_args = _maybe_read_budget_from_args()
    budget = {**(budget or {}), **budget_from_args}
    inform_user(f'Using budget {budget}...')

    # Make advisor
    advisor = make_advisor(knob_config, budget)
    inform_user(f'Using advisor "{type(advisor).__name__}"...')

    # Create caches & stores
    param_store: ParamStore = FileParamStore()
    param_cache: ParamCache = ParamCache()
    train_cache: TrainCache = TrainCache()

    # Variables to track over trials
    best_model_score = -1
    best_trial_no = 0
    best_model_test_score = None
    best_proposal = None
    best_store_params_id = None

    # Train worker tells advisor that it is free
    train_cache.add_worker(worker_id)

    # Keep conducting trials until there are no more proposals
    trial_no = 0
    while True:
        trial_no += 1

        # Advisor checks free workers
        worker_ids = train_cache.get_workers()
        assert worker_id in worker_ids

        # Advisor checks that the worker doesn't already have a proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is None

        # Advisor sends a proposal to the worker, overriding knobs from args
        proposal = advisor.propose(worker_id, trial_no)
        if proposal is None:
            print('No more proposals from advisor - stopping training')
            break
        proposal.knobs = {**proposal.knobs, **knobs_from_args}
        train_cache.create_proposal(worker_id, proposal)

        # Worker receives proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is not None

        # Worker starts trial
        _print_header(f'Trial #{trial_no}')
        print('Proposal from advisor:', proposal)

        # Worker loads model
        model_inst = py_model_class(**proposal.knobs)

        # Worker pulls shared params
        shared_params = _pull_shared_params(proposal, param_cache)

        # Worker trains model
        print('Training model...')
        if annotation_dataset_path:
            model_inst.train(train_dataset_path,
                             annotation_dataset_path=annotation_dataset_path,
                             shared_params=shared_params,
                             **(train_args or {}))
        else:
            model_inst.train(train_dataset_path,
                             shared_params=shared_params,
                             **(train_args or {}))

        # Worker evaluates model
        if annotation_dataset_path:
            result = _evaluate_model(model_inst, proposal, val_dataset_path,
                                     annotation_dataset_path)
        else:
            result = _evaluate_model(model_inst, proposal, val_dataset_path)

        # Worker caches/saves model parameters
        store_params_id = _save_model(model_inst, proposal, result,
                                      param_cache, param_store)

        # Update best saved model
        if result.score is not None and store_params_id is not None \
                and result.score > best_model_score:
            inform_user('Best saved model so far! Beats previous best of score {}!'
                        .format(best_model_score))
            best_store_params_id = store_params_id
            best_proposal = proposal
            best_model_score = result.score
            best_trial_no = trial_no

            # Test best model, if a test dataset is provided
            if test_dataset_path is not None:
                print('Evaluating new best model on test dataset...')
                if annotation_dataset_path:
                    best_model_test_score = model_inst.evaluate(
                        test_dataset_path,
                        annotation_dataset_path=annotation_dataset_path)
                else:
                    best_model_test_score = model_inst.evaluate(test_dataset_path)
                inform_user(
                    'Score on test dataset: {}'.format(best_model_test_score))

        # Worker sends result to advisor
        print('Giving feedback to advisor...')
        train_cache.create_result(worker_id, result)
        train_cache.delete_proposal(worker_id)

        # Advisor receives result and ingests it as feedback
        result = train_cache.take_result(worker_id)
        assert result is not None
        advisor.feedback(worker_id, result)

        # Destroy model
        model_inst.destroy()

        if task == 'question_answering_covid19':
            break

    # Train worker tells advisor that it is no longer free
    train_cache.delete_worker(worker_id)

    # Declare best model
    if best_proposal is not None:
        inform_user('Best trial #{} has knobs {} with score of {}'.format(
            best_trial_no, best_proposal.knobs, best_model_score))
        if best_model_test_score is not None:
            inform_user('...with test score of {}'.format(best_model_test_score))

    # Load params for best model
    best_params = None
    if best_store_params_id is not None:
        best_params = param_store.load(best_store_params_id)

    # Teardown model class
    print('Running model class teardown...')
    py_model_class.teardown()

    # Print duration
    duration = time.time() - start_time
    print('Tuning took a total of {}s'.format(duration))

    return (best_proposal, best_model_test_score, best_params)
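# A hedged usage sketch for tune_model(). `MyModel`, the dataset paths, and the
# 'TRIALS' budget key are hypothetical placeholders (the real budget schema and
# model class are assumptions here, not confirmed by this file).
if __name__ == '__main__':
    from examples.models.my_model import MyModel   # hypothetical model class

    best_proposal, test_score, params = tune_model(
        MyModel,
        train_dataset_path='data/train.zip',       # hypothetical path
        val_dataset_path='data/val.zip',           # hypothetical path
        test_dataset_path='data/test.zip',         # hypothetical path
        budget={'TRIALS': 10},                     # assumed budget key
    )
    print('Best knobs:', best_proposal.knobs if best_proposal else None)
    print('Test score:', test_score)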