def rollout(self, params, render=False):
    """ Execute a rollout and return minus cumulative reward.

    Load :params: into the controller and execute a single rollout. This
    is the main API of this class.

    :args params: parameters as a single 1D np array

    :returns: minus cumulative reward
    """
    # copy params into the controller
    if params is not None:
        load_parameters(params, self.controller)

    obs = self.env.reset()
    hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

    cumulative = 0
    i = 0
    while True:
        obs = transform(obs.transpose(1, 2, 0)).unsqueeze(0).to(self.device)
        action, hidden = self.get_action_and_transition(obs, hidden)
        action = action.item()
        if -.33 < action < .33:
            # Don't move.
            action = np.array([False, False])
        elif action < -.33:
            # Move left.
            action = np.array([True, False])
        else:
            # Move right.
            action = np.array([False, True])

        obs, reward, done, _ = self.env.step(action)
        if render:
            self.env.render()

        cumulative += reward
        if done or i > self.time_limit:
            return -cumulative
        i += 1
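# The rollout above, and the CMA-ES code below, rely on two helpers,
# flatten_parameters and load_parameters, that shuttle controller weights to
# and from the flat 1D vector CMA-ES operates on. A minimal sketch of what
# they are assumed to do (not necessarily the repository's exact
# implementation):
import numpy as np
import torch


def flatten_parameters(params):
    """Concatenate parameter tensors into one flat 1D numpy array (CMA-ES input)."""
    return torch.cat([p.detach().view(-1) for p in params], dim=0).cpu().numpy()


def load_parameters(flat, controller):
    """Copy consecutive slices of a flat vector back into the controller's parameters."""
    flat = torch.as_tensor(np.asarray(flat), dtype=torch.float32)
    offset = 0
    for p in controller.parameters():
        n = p.numel()
        p.data.copy_(flat[offset:offset + n].view_as(p))
        offset += n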
def load_ckpt(model, optimizer, ckpt_path, load_model=False, load_opt=False,
              load_misc=False, is_freezeD=False):
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    if load_model:
        if is_freezeD:
            mismatch_names = misc.load_parameters(src=ckpt["state_dict"],
                                                  dst=model.state_dict(),
                                                  strict=False)
            print("The following parameters/buffers do not match with the ones of the pre-trained model:",
                  mismatch_names)
        else:
            model.load_state_dict(ckpt["state_dict"], strict=False)

    if load_opt:
        optimizer.load_state_dict(ckpt["optimizer"])
        # Move optimizer state tensors back onto the GPU after CPU deserialization.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()

    if load_misc:
        seed = ckpt["seed"]
        run_name = ckpt["run_name"]
        step = ckpt["step"]
        # Fall back to older checkpoint keys when the newer ones are absent.
        try:
            aa_p = ckpt["aa_p"]
        except KeyError:
            aa_p = ckpt["ada_p"]
        best_step = ckpt["best_step"]
        best_fid = ckpt["best_fid"]
        try:
            epoch = ckpt["epoch"]
        except KeyError:
            epoch = 0
        try:
            topk = ckpt["topk"]
        except KeyError:
            topk = "initialize"
        try:
            best_ckpt_path = ckpt["best_fid_checkpoint_path"]
        except KeyError:
            best_ckpt_path = ckpt["best_fid_ckpt"]
        return seed, run_name, step, epoch, topk, aa_p, best_step, best_fid, best_ckpt_path
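# A hedged usage sketch for load_ckpt when resuming a run ("model", "opt" and
# the checkpoint path below are placeholders, not names from the source):
#
#   seed, run_name, step, epoch, topk, aa_p, best_step, best_fid, best_ckpt_path = \
#       load_ckpt(model, opt, "checkpoints/current.pth",
#                 load_model=True, load_opt=True, load_misc=True)
#
# Note that the function only returns the misc tuple when load_misc=True;
# otherwise it implicitly returns None, so don't unpack the result in that case.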
def train_explorer(logdir, epochs=10, n_samples=4, pop_size=4, display=True,
                   max_workers=10, target_return=950):
    # target_return: reward threshold for early stopping (used below; default
    # matches the value used elsewhere in this code).
    results = {}
    results['best'] = []

    # multiprocessing variables
    num_workers = min(max_workers, n_samples * pop_size)
    time_limit = 1000

    # create tmp dir if non-existent and clean it if existent
    tmp_dir = join(logdir, 'tmp_exp')
    if not exists(tmp_dir):
        mkdir(tmp_dir)
    else:
        for fname in listdir(tmp_dir):
            unlink(join(tmp_dir, fname))

    # create exp dir if non-existent
    explore_dir = join(logdir, 'explore')
    if not exists(explore_dir):
        mkdir(explore_dir)

    ################################################################################
    #                           Thread routines                                    #
    ################################################################################
    def slave_routine(p_queue, r_queue, e_queue, p_index):
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the
        result queue and e_queue, the end queue. They pull parameters from
        p_queue, execute the corresponding rollout, then place the result in
        r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result). The same
        parameter can appear multiple times in p_queue, carrying the same id
        each time.

        As soon as e_queue is non-empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).

        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        gpu = p_index % torch.cuda.device_count()
        device = torch.device(
            'cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')

        # redirect streams
        sys.stdout = open(join(tmp_dir, str(getpid()) + '.out'), 'a')
        sys.stderr = open(join(tmp_dir, str(getpid()) + '.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(logdir, device, time_limit)

            while e_queue.empty():
                if p_queue.empty():
                    sleep(.1)
                else:
                    s_id, params = p_queue.get()
                    r_queue.put((s_id, r_gen.rollout(params)))

    ################################################################################
    #                    Define queues and start workers                           #
    ################################################################################
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    for p_index in range(num_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, p_index)).start()

    ################################################################################
    #                              Evaluation                                      #
    ################################################################################
    def evaluate(solutions, results, rollouts=100):
        """ Give current controller evaluation.

        Evaluation is minus the cumulated reward averaged over rollout runs.
        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus averaged cumulated reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            p_queue.put((s_id, best_guess))

        print("Evaluating...")
        for _ in tqdm(range(rollouts)):
            while r_queue.empty():
                sleep(.1)
            restimates.append(r_queue.get()[1])

        return best_guess, np.mean(restimates), np.std(restimates)

    ################################################################################
    #                              Launch CMA                                      #
    ################################################################################
    controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

    # define current best and load parameters
    cur_best = None
    ctrl_file = join(explore_dir, 'best.tar')
    print("Attempting to load previous best...")
    if exists(ctrl_file):
        state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': pop_size})

    epoch = 0
    log_step = 3
    while not es.stop():
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(n_samples):
                p_queue.put((s_id, s))

        # retrieve results
        if display:
            pbar = tqdm(total=pop_size * n_samples)
        for _ in range(pop_size * n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / n_samples
            if display:
                pbar.update(1)
        if display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)
            # log the best
            results['best'].append(best)
            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, join(explore_dir, 'best.tar'))
            if -best > target_return:
                print("Terminating controller training with value {}...".format(
                    best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')

    return results
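# A hedged invocation sketch for train_explorer (the log-directory name and
# hyperparameter values here are illustrative, not from the source):
#
#   results = train_explorer('logs/exp0', epochs=10, n_samples=4,
#                            pop_size=4, display=True, max_workers=10)
#   # results['best'] logs minus the average cumulative reward at each
#   # evaluation step; lower is better.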
                pbar.update(1)
        if args.display:
            pbar.close()

        count_g += 1
        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)
            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, join(ctrl_dir, 'best.tar'))
            if -best > args.target_return:
                print("Terminating controller training with value {}...".format(
                    best))
                break

        torch.save(
            {
                'epoch': epoch,
                'reward': -cur_best,
                'state_dict': controller.state_dict()
            }, join(ctrl_dir, 'checkpoint.tar'))  # assumed per-epoch filename; the destination is truncated in the source
            if args.display:
                pbar.update(1)
        if args.display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)
            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {'epoch': epoch,
                     'reward': -cur_best,
                     'state_dict': controller.state_dict()},
                    join(ctrl_dir, 'best.tar'))
            if -best > args.target_return:
                print("Terminating controller training with value {}...".format(best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')
def run(args):
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    latent = 32
    mixture = 256
    size = latent + mixture
    controller = Controller(size, 3)

    for i in range(args.max_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, i, args.logdir)).start()

    cur_best = None
    savefile = args.logdir / 'best.tar'
    if savefile.exists():
        print(f'Loading from {savefile}')
        state = torch.load(savefile.as_posix(), map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])

    parameters = controller.parameters()
    sigma = 0.1
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), sigma,
                                  {'popsize': args.pop_size})

    epoch = 0
    while not es.stop():
        if cur_best is not None and -cur_best > args.target_return:
            print('Already better than target, breaking...')
            break

        r_list = [0] * args.pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(args.n_samples):
                p_queue.put((s_id, s))

        # Retrieve results
        if args.display:
            pbar = tqdm(total=args.pop_size * args.n_samples)
        for _ in range(args.pop_size * args.n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / args.n_samples
            if args.display:
                pbar.update(1)
        if args.display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # CMA-ES seeks to minimize, so we want to multiply the reward we
        # get in a rollout by -1.
        best_params, best, std_best = evaluate(solutions, r_list,
                                               p_queue, r_queue)
        if (not cur_best) or (cur_best > best):
            cur_best = best
            print(f'Saving new best with value {-cur_best}+-{std_best}')
            load_parameters(best_params, controller)
            torch.save({'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()},
                       savefile)

        # Save after every epoch (controller_pt is defined elsewhere in the script)
        torch.save(controller.state_dict(), f'{controller_pt}')

        if -best > args.target_return:
            print(f'Terminating controller training with value {best}...')
            break
        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')
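# Sign convention used throughout these snippets: a rollout returns minus its
# cumulative reward, so CMA-ES (a minimizer) effectively maximizes the true
# return. A tiny illustration of why argmin picks the best-performing rollout:
import numpy as np

returns = np.array([-900.0, -450.0, -120.0])  # minus cumulative rewards
best_idx = np.argmin(returns)                 # 0 -> the rollout that earned +900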
def controller_train_proc(ctrl_dir, controller, vae, mdrnn, target_return=950,
                          skip_train=False, display=True):
    step_log('4-2. controller_train_proc START!!')

    # define current best and load parameters
    cur_best = None
    if not os.path.exists(ctrl_dir):
        os.mkdir(ctrl_dir)
    ctrl_file = os.path.join(ctrl_dir, 'best.tar')

    p_queue = Queue()
    r_queue = Queue()
    # e_queue = Queue()  # pipaek: not necessary if not multiprocessing

    print("Attempting to load previous best...")
    if os.path.exists(ctrl_file):
        # state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        state = torch.load(ctrl_file)
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    if skip_train:
        return  # pipaek: for when you want to skip improving the model through training

    def evaluate(solutions, results, rollouts=100):  # pipaek: rollout 100 -> 10, originally 100
        """ Give current controller evaluation.

        Evaluation is minus the cumulated reward averaged over rollout runs.

        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus averaged cumulated reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            print('p_queue.put(), s_id=%d' % s_id)
            p_queue.put((s_id, best_guess))

        print('>>>rollout_routine!!')
        rollout_routine()  # pipaek: here too, rollouts are processed as soon as p_queue.put is called

        print(">>>Evaluating...")
        for _ in tqdm(range(rollouts)):
            # while r_queue.empty():
            #     sleep(.1)  # pipaek: not needed, since this is not multi-process
            if not r_queue.empty():
                # pipaek 20180718: check to avoid getting stuck in r_queue.get()!!
                r_s_id, r = r_queue.get()
                print('in evaluate r_queue.get() r_s_id=%d, r_queue remain=%d'
                      % (r_s_id, r_queue.qsize()))
                restimates.append(r)
            else:
                print('r_queue.empty() -> break!!')
                break

        return best_guess, np.mean(restimates), np.std(restimates)

    def rollout_routine():
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the
        result queue and e_queue, the end queue. They pull parameters from
        p_queue, execute the corresponding rollout, then place the result in
        r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result). The same
        parameter can appear multiple times in p_queue, carrying the same id
        each time.

        As soon as e_queue is non-empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).
        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        # gpu = p_index % torch.cuda.device_count()
        # device = torch.device('cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')

        # redirect streams
        # if not os.path.exists(tmp_dir):
        #     os.mkdir(tmp_dir)
        # sys.stdout = open(os.path.join(tmp_dir, 'rollout.out'), 'a')
        # sys.stderr = open(os.path.join(tmp_dir, 'rollout.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(vae, mdrnn, controller, device,
                                     rollout_time_limit)

            while not p_queue.empty():
                print('in rollout_routine, p_queue.get()')
                s_id, params = p_queue.get()
                print('r_queue.put() sid=%d' % s_id)
                r_queue.put((s_id, r_gen.rollout(params)))
                print('r_gen.rollout OK, r_queue.put()')

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': C_POP_SIZE})
    print("CMAEvolutionStrategy start OK!!")

    epoch = 0
    log_step = 3
    while not es.stop():
        print("--------------------------------------")
        print("CURRENT EPOCH = %d" % epoch)
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * C_POP_SIZE  # result list
        solutions = es.ask()
        print("CMAEvolutionStrategy-ask")

        # push parameters to queue
        for s_id, s in enumerate(solutions):  # pipaek: this loop repeats C_POP_SIZE times
            for _ in range(C_N_SAMPLES):
                print('in controller_train_proc p_queue.put() s_id : %d' % s_id)
                p_queue.put((s_id, s))
        rollout_routine()  # pipaek: rollout right after p_queue.put, then push the result to r_queue
        print("rollout_routine OK, r_queue size=%d" % r_queue.qsize())

        # retrieve results
        if display:
            pbar = tqdm(total=C_POP_SIZE * C_N_SAMPLES)
        # pipaek 20180718: use while instead of for, to avoid hanging forever on r_queue.get()
        while not r_queue.empty():
            try:
                r_s_id, r = r_queue.get()
                print('in controller_train_proc r_queue.get() r_s_id=%d, r_queue remain=%d'
                      % (r_s_id, r_queue.qsize()))
                r_list[r_s_id] += r / C_N_SAMPLES
                if display:
                    pbar.update(1)
            except IndexError:
                print('IndexError during r_queue.get()')
                print('cur r_list size:%d, index:%d' % (len(r_list), r_s_id))
        if display:
            pbar.close()

        es.tell(solutions, r_list)  # pipaek: update the solution array with the r_list results
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            print(">>>> TRYING EVALUATION, CURRENT EPOCH = %d" % epoch)
            best_params, best, std_best = evaluate(
                solutions, r_list, rollouts=100)  # pipaek: do only 10 rollouts for evaluation.. originally 100
            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, os.path.join(ctrl_dir, 'best.tar'))
            if -best > target_return:
                print("Terminating controller training with value {}...".format(
                    best))
                break

        epoch += 1

    print("es.stop!!")
    es.result_pretty()