def test_alpgmm(env, nb_episodes, gif=True, nb_dims=2, score_step=1000, verbose=True, params={}):
    # Init teacher
    task_generator = ALPGMM([0] * nb_dims, [1] * nb_dims, params=params)

    # Init book keeping
    rewards = []
    scores = []
    bk = {'weights': [], 'covariances': [], 'means': [], 'tasks_lps': [],
          'episodes': [], 'comp_grids': [], 'comp_xs': [], 'comp_ys': []}

    # Launch run
    for i in range(nb_episodes + 1):
        if (i % score_step) == 0:
            scores.append(env.get_score())
            if nb_dims == 2:
                if verbose:
                    print(env.cube_competence)
            else:
                if verbose:
                    print("it:{}, score:{}".format(i, scores[-1]))

        # Book keeping if ALP-GMM updated its GMM
        if i > 100 and (i % task_generator.fit_rate) == 0 and (gif is True):
            bk['weights'].append(task_generator.gmm.weights_.copy())
            bk['covariances'].append(task_generator.gmm.covariances_.copy())
            bk['means'].append(task_generator.gmm.means_.copy())
            bk['tasks_lps'] = task_generator.tasks_alps
            bk['episodes'].append(i)
            if nb_dims == 2:
                bk['comp_grids'].append(env.cube_competence.copy())
                bk['comp_xs'].append(env.bnds[0].copy())
                bk['comp_ys'].append(env.bnds[1].copy())

        task = task_generator.sample_task()
        reward = env.episode(task)
        task_generator.update(np.array(task), reward)
        rewards.append(reward)

    if gif and nb_dims == 2:
        print('Creating gif...')
        gmm_plot_gif(bk, gifname='alpgmm_' + str(time.time()), gifdir='toy_env_gifs/')
        print('Done (see graphics/toy_env_gifs/ folder)')
    return scores
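# Hedged usage sketch (not in the original file): one way test_alpgmm might be driven.
# It only assumes an `env` exposing episode(task), get_score(), cube_competence and bnds,
# as used above; the ToyEnv name and its constructor are illustrative assumptions.
#
#   env = ToyEnv()                                   # hypothetical 2-D toy student
#   scores = test_alpgmm(env, nb_episodes=20000, gif=False,
#                        nb_dims=2, score_step=1000,
#                        params={"fit_rate": 250})   # 'fit_rate' matches the attribute read above
#   print(scores)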
class ALPGMMTeacher(gym.Wrapper):
    def __init__(self, env, **kwargs):
        from teachDRL.teachers.algos.alp_gmm import ALPGMM
        super(ALPGMMTeacher, self).__init__(env)
        self.cond_bounds = self.env.unwrapped.cond_bounds
        self.midep_trgs = False
        env_param_lw_bounds = [self.cond_bounds[k][0] for k in self.usable_metrics]
        env_param_hi_bounds = [self.cond_bounds[k][1] for k in self.usable_metrics]
        self.alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
        self.trg_vec = None
        self.trial_reward = 0
        self.n_trial_steps = 0

    def reset(self):
        if self.trg_vec is not None:
            if self.n_trial_steps == 0:
                # Edge case: this happens when we reset manually from the inference script.
                rew = 0
            else:
                rew = self.trial_reward / self.n_trial_steps
            self.alp_gmm.update(self.trg_vec, rew)
        trg_vec = self.alp_gmm.sample_task()
        self.trg_vec = trg_vec
        trgs = {k: trg_vec[i] for (i, k) in enumerate(self.usable_metrics)}
        # print(trgs)
        self.set_trgs(trgs)
        self.trial_reward = 0
        self.n_trial_steps = 0
        return self.env.reset()

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        self.trial_reward += rew
        self.n_trial_steps += 1
        return obs, rew, done, info
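# Hedged usage sketch (illustrative only): ALPGMMTeacher expects the wrapped env to expose
# unwrapped.cond_bounds, usable_metrics and set_trgs(...), as used above. make_cond_env is a
# hypothetical constructor standing in for whatever builds such an env in this codebase.
#
#   env = ALPGMMTeacher(make_cond_env(...))
#   obs = env.reset()                       # samples a target vector via ALP-GMM and sets it
#   obs, rew, done, info = env.step(env.action_space.sample())
#   env.reset()                             # reports mean per-step reward of the finished trial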
def main():
    import random
    import gym_micropolis
    import game_of_life

    args = get_args()
    args.log_dir = args.save_dir + '/logs'
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'
    args.poet = True  # hacky
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ')
    actor_critic = False
    agent = False
    past_steps = 0
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            if args.overwrite:
                os.remove(f)
            else:
                pass
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None
    if 'GameOfLife' in args.env_name:
        print('env name: {}'.format(args.env_name))
        num_actions = 1
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False, None, args=args)

    if isinstance(envs.observation_space, gym.spaces.Discrete):
        num_inputs = envs.observation_space.n
    elif isinstance(envs.observation_space, gym.spaces.Box):
        if len(envs.observation_space.shape) == 3:
            in_w = envs.observation_space.shape[1]
            in_h = envs.observation_space.shape[2]
        else:
            in_w = 1
            in_h = 1
        num_inputs = envs.observation_space.shape[0]
    if isinstance(envs.action_space, gym.spaces.Discrete):
        out_w = 1
        out_h = 1
        if 'Micropolis' in args.env_name:  # otherwise it's set
            if args.power_puzzle:
                num_actions = 1
            else:
                num_actions = 19  # TODO: have this already from env
        elif 'GameOfLife' in args.env_name:
            num_actions = 1
        else:
            num_actions = envs.action_space.n
    elif isinstance(envs.action_space, gym.spaces.Box):
        if len(envs.action_space.shape) == 3:
            out_w = envs.action_space.shape[1]
            out_h = envs.action_space.shape[2]
        elif len(envs.action_space.shape) == 1:
            out_w = 1
            out_h = 1
        num_actions = envs.action_space.shape[-1]
    print('num actions {}'.format(num_actions))

    if args.auto_expand:
        args.n_recs -= 1
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'map_width': args.map_width,
                              'num_actions': num_actions,
                              'recurrent': args.recurrent_policy,
                              'prebuild': args.prebuild,
                              'in_w': in_w,
                              'in_h': in_h,
                              'num_inputs': num_inputs,
                              'out_w': out_w,
                              'out_h': out_h
                          },
                          curiosity=args.curiosity,
                          algo=args.algo,
                          model=args.model,
                          args=args)
    if args.auto_expand:
        args.n_recs += 1

    evaluator = None

    if not agent:
        agent = init_agent(actor_critic, args)

    #saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    if args.load_dir:
        saved_model = os.path.join(args.load_dir, args.env_name + '.tar')
    else:
        saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
    vec_norm = get_vec_normalize(envs)
    alp_gmm = None
    if os.path.exists(saved_model) and not args.overwrite:
        checkpoint = torch.load(saved_model)
        saved_args = checkpoint['args']
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
        actor_critic.to(device)
        actor_critic.cuda()
        #agent = init_agent(actor_critic, saved_args)
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if args.auto_expand:
            if not args.n_recs - saved_args.n_recs == 1:
                print('can expand by 1 rec only from saved model, not {}'.format(
                    args.n_recs - saved_args.n_recs))
                raise Exception
            actor_critic.base.auto_expand()
            print('expanded net: \n{}'.format(actor_critic.base))
        past_steps = checkpoint['past_steps']
        ob_rms = checkpoint['ob_rms']
        past_steps = next(iter(
            agent.optimizer.state_dict()['state'].values()))['step']
        print('Resuming from step {}'.format(past_steps))

        #print(type(next(iter((torch.load(saved_model))))))
        #actor_critic, ob_rms = \
        #    torch.load(saved_model)
        #agent = \
        #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        #if not agent.optimizer.state_dict()['state'].values():
        #    past_steps = 0
        #else:
        #    raise Exception
        alp_gmm = checkpoint['alp_gmm']
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
        saved_args.num_frames = args.num_frames
        saved_args.vis_interval = args.vis_interval
        saved_args.eval_interval = args.eval_interval
        saved_args.overwrite = args.overwrite
        saved_args.n_recs = args.n_recs
        saved_args.intra_shr = args.intra_shr
        saved_args.inter_shr = args.inter_shr
        saved_args.map_width = args.map_width
        saved_args.render = args.render
        saved_args.print_map = args.print_map
        saved_args.load_dir = args.load_dir
        saved_args.experiment_name = args.experiment_name
        saved_args.log_dir = args.log_dir
        saved_args.save_dir = args.save_dir
        saved_args.num_processes = args.num_processes
        saved_args.n_chan = args.n_chan
        saved_args.prebuild = args.prebuild
        args = saved_args
    actor_critic.to(device)

    if 'LSTM' in args.model:
        recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size()
    else:
        recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
    if args.curiosity:
        rollouts = CuriosityRolloutStorage(
            args.num_steps,
            args.num_processes,
            envs.observation_space.shape,
            envs.action_space,
            recurrent_hidden_state_size,
            actor_critic.base.feature_state_size(),
            args=args)
    else:
        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  recurrent_hidden_state_size,
                                  args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    model = actor_critic.base
    reset_eval = False
    plotter = None
    if args.model == 'FractalNet' or args.model == 'fractal':
        n_cols = model.n_cols
        if args.rule == 'wide1' and args.n_recs > 3:
            col_step = 3
        else:
            col_step = 1
    else:
        n_cols = 0
        col_step = 1

    env_param_bounds = envs.venv.venv.get_param_bounds()
    envs.venv.venv.set_param_ranges(env_param_bounds)
    num_env_params = len(env_param_bounds)
    env_param_ranges = [abs(v[1] - v[0]) for k, v in env_param_bounds.items()]
    env_param_lw_bounds = [v[0] for k, v in env_param_bounds.items()]
    env_param_hi_bounds = [v[1] for k, v in env_param_bounds.items()]
    if alp_gmm is None:
        alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
    params_vec = alp_gmm.sample_task()
    params = OrderedDict()
    print('\n env_param_bounds', env_param_bounds)
    print(params_vec)
    trial_remaining = args.max_step
    trial_reward = 0

    for j in range(past_steps, num_updates):
        if trial_remaining == 0:
            trial_reward = trial_reward / args.num_processes
            alp_gmm.update(params_vec, trial_reward)
            trial_reward = 0
            trial_remaining = args.max_step
            # sample random environment parameters
            params_vec = alp_gmm.sample_task()
            prm_i = 0
            for k, v in env_param_bounds.items():
                params[k] = params_vec[prm_i]
                prm_i += 1
            envs.venv.venv.set_params(params)
        trial_remaining -= args.num_steps

        if reset_eval:
            print('post eval reset')
            obs = envs.reset()
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)
            reset_eval = False
        #if np.random.rand(1) < 0.1:
        #    envs.venv.venv.remotes[1].send(('setRewardWeights', None))
        if args.model == 'FractalNet' and args.drop_path:
            #if args.intra_shr and args.inter_shr:
            #    n_recs = np.randint
            #    model.set_n_recs()
            model.set_drop_path()
        if args.model == 'fixed' and model.RAND:
            model.num_recursions = random.randint(1, model.map_width * 2)
        player_act = None
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                if args.render:
                    if args.num_processes == 1:
                        if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name):
                            envs.venv.venv.render()
                        else:
                            pass
                    else:
                        if not ('Micropolis' in args.env_name or 'GameOfLife' in args.env_name):
                            envs.render()
                            envs.venv.venv.render()
                        else:
                            pass
                            #envs.venv.venv.remotes[0].send(('render', None))
                            #envs.venv.venv.remotes[0].recv()
                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    player_act=player_act,
                    icm_enabled=args.curiosity,
                    deterministic=False)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:
                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']

            if args.curiosity:
                # run icm
                with torch.no_grad():
                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                        (rollouts.obs[step], obs, action_bin))
                intrinsic_reward = args.eta * (
                    (feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()
            for info in infos:
                if 'episode' in info.keys():
                    epi_reward = info['episode']['r']
                    episode_rewards.append(epi_reward)
                    trial_reward += epi_reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin,
                                action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".format(
                    j, total_num_steps,
                    int((total_num_steps - past_steps * args.num_processes * args.num_steps) / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".format(
                    fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device, envs=envs, vec_norm=vec_norm)

            model = evaluator.actor_critic.base
            col_idx = [-1, *range(0, n_cols, col_step)]
            for i in col_idx:
                evaluator.evaluate(column=i)
            #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step
            # making sure the evaluator plots the '-1'st column (the overall net)
            if args.vis:  #and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win_eval = evaluator.plotter.visdom_plot(
                        viz,
                        win_eval,
                        evaluator.eval_log_dir,
                        graph_name,
                        args.algo,
                        args.num_frames,
                        n_graphs=col_idx)
                except IOError:
                    pass
            #elif args.model == 'fixed' and model.RAND:
            #    for i in model.eval_recs:
            #        evaluator.evaluate(num_recursions=i)
            #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
            #                           args.algo, args.num_frames, n_graphs=model.eval_recs)
            #else:
            #    evaluator.evaluate(column=-1)
            #    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, graph_name,
            #                           args.algo, args.num_frames)
            reset_eval = True

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None)
            save_model = copy.deepcopy(actor_critic)
            save_agent = copy.deepcopy(agent)
            if args.cuda:
                save_model.cpu()
            optim_save = save_agent.optimizer.state_dict()

            # experimental:
            torch.save(
                {
                    'past_steps':
                    next(iter(agent.optimizer.state_dict()['state'].values()))['step'],
                    'model_state_dict': save_model.state_dict(),
                    'optimizer_state_dict': optim_save,
                    'ob_rms': ob_rms,
                    'args': args,
                    'alp_gmm': alp_gmm
                }, os.path.join(save_path, args.env_name + ".tar"))

            #save_model = [save_model,
            #              getattr(get_vec_normalize(envs), 'ob_rms', None)]
            #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            #save_agent = copy.deepcopy(agent)
            #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
            #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

        if args.vis and j % args.vis_interval == 0:
            if plotter is None:
                plotter = Plotter(n_cols, args.log_dir, args.num_processes)
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = plotter.visdom_plot(viz, win, args.log_dir, graph_name,
                                          args.algo, args.num_frames)
            except IOError:
                pass
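# Minimal sketch (assumption-laden, not part of main()) of the ALP-GMM curriculum loop used
# above, stripped of the PPO/rollout machinery. `env`, `param_names`, `max_step`, `num_steps`,
# `num_processes` and `episode_returns` are illustrative stand-ins for the corresponding
# pieces of main().
#
#   alp_gmm = ALPGMM(env_param_lw_bounds, env_param_hi_bounds)
#   params_vec = alp_gmm.sample_task()
#   trial_reward, trial_remaining = 0, max_step
#   for j in range(num_updates):
#       if trial_remaining == 0:                               # one trial = max_step env steps
#           alp_gmm.update(params_vec, trial_reward / num_processes)
#           params_vec = alp_gmm.sample_task()                 # new environment parameters
#           env.set_params(dict(zip(param_names, params_vec)))
#           trial_reward, trial_remaining = 0, max_step
#       trial_remaining -= num_steps
#       trial_reward += sum(episode_returns)                   # returns of episodes finished this update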
class AGAIN():
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if not seed:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)

        self.classroom_filename = "student_history" if "classroom_filename" not in params else params['classroom_filename']
        self.classroom_portion = 100 if "classroom_portion" not in params else params['classroom_portion']
        self.use_alpgmm = False if "use_alpgmm" not in params else params['use_alpgmm']
        self.pre_test_epoch_idx = 2 if "pretrain_epochs" not in params else params['pretrain_epochs']
        self.restart_after_pretrain = False if "restart_after_pretrain" not in params else params['restart_after_pretrain']
        self.k = 5 if "k" not in params else params['k']
        self.random_expert = False if "random_expert" not in params else params['random_expert']
        self.nb_test_epochs = 0
        self.use_ground_truth = False if 'use_ground_truth' not in params else params['use_ground_truth']
        self.is_toy_env = False if "is_toy_env" not in params else params['is_toy_env']
        self.current_student_params = params['student_params']
        #self.decorelate_alpgmm = False if "decorelate_alpgmm" not in params else params['decorelate_alpgmm']
        self.nb_alpgmm_gaussians = None

        # setting up alpgmm for pre-test phase
        self.alpgmm = ALPGMM(mins, maxs, seed=seed, params=params)
        self.is_new_alpgmm = False  # boolean used to track alpgmm's periodic updates
        self.random_task_ratio = 0.1
        self.post_pre_test_task_ratio = 0.02 if "random_task_ratio" not in params else params["random_task_ratio"]
        self.in_end_rnd = self.post_pre_test_task_ratio if 'in_end_rnd' not in params else params['in_end_rnd']
        self.sampled_gaussian_idx = None

        self.stop_R = False if "stop_R" not in params else params['stop_R']
        self.nb_eps_after_R = 0

        self.expert_means, self.expert_covs, self.expert_mean_rewards = None, None, None  # will be defined after pre test
        self.expert_type = "P" if "expert_type" not in params else params["expert_type"]
        self.r_list_len = 50 if "r_list_len" not in params else params["r_list_len"]
        self.tol_ratio = 1.0 if "tol_ratio" not in params else params["tol_ratio"]
        if self.expert_type == 'R':
            self.reward_list = deque(maxlen=self.r_list_len)
        self.expert_idx = -1
        self.episode_nb = 0
        self.current_means = None
        self.current_covs = None
        self.current_mean_r = None

        # Boring book-keeping
        #self._update()
        self.bk = {
            'cegt_k': self.k,
            'cegt_pt': self.pre_test_epoch_idx,
            'cegt_expert_type': self.expert_type,
            'cegt_cf': self.classroom_filename,
            'cegt_rap': self.restart_after_pretrain,
            'stop_R': self.stop_R,
            'cegt_covariances': [],
            'cegt_means': [],
            'cegt_episodes': [self.episode_nb],
            'cegt_tasks_origin': [],
            'cegt_nb_alpgmm_gaussians': [],
            'cegt_expert_idx': [],
            'cegt_test_vectors': []
        }

        if self.pre_test_epoch_idx == 0:
            self.send_test_info(None, epoch_0=True)

    def send_test_info(self, test_vec, epoch_0=False):
        self.bk['cegt_test_vectors'].append(test_vec)
        #print('len test vec is')
        #print(len(test_vec))
        #print(test_vec.shape)
        if epoch_0:  # do not increment if called from init
            assert (self.random_expert or self.use_ground_truth)
        else:
            self.nb_test_epochs += 1

        if self.nb_test_epochs == self.pre_test_epoch_idx:  # time to find an expert from classroom
            self.bk['pre_test_vec'] = test_vec
            # load classroom history
            path = "teachDRL/data/elders_knowledge/{}.pkl".format(self.classroom_filename)
            print("loading from {}".format(path))
            is_v2 = False
            if "v2" in self.classroom_filename:
                is_v2 = True
            student_ids, initial_test_vectors_list, last_test_vector, last_perfs, student_params = pickle.load(
                open(path, "rb"))

            if self.classroom_portion != 100:  # take a random sample subpart of classroom
                sample_len = int(len(student_ids) * (self.classroom_portion / 100))
                print('using only {} classroom data sampled randomly'.format(sample_len))
                old_rnd_state = random.getstate()
                random.seed(self.seed)
                sampled_student_ids = random.sample(student_ids, sample_len)
                sampled_initial_test_vectors_list = []
                for kc_v in initial_test_vectors_list:
                    random.seed(self.seed)
                    sampled_initial_test_vectors_list.append(random.sample(kc_v, sample_len))
                random.seed(self.seed)
                sampled_last_test_vector = random.sample(last_test_vector, sample_len)
                random.seed(self.seed)
                sampled_last_perfs = random.sample(last_perfs, sample_len)
                if self.is_toy_env and is_v2:
                    random.seed(self.seed)
                    student_params['start_cube_idx'] = random.sample(
                        student_params['start_cube_idx'], sample_len)
                else:
                    print('portion of non toy env v2 classroom is not yet supported')
                    exit(1)
                random.setstate(old_rnd_state)  # restore random state
                # set classroom to classroom sample
                initial_test_vectors_list = sampled_initial_test_vectors_list
                student_ids = sampled_student_ids
                last_test_vector = sampled_last_test_vector
                last_perfs = sampled_last_perfs

            if self.random_expert:
                print('choosing expert randomly !')
                expert_id = np.random.choice(student_ids)
            else:
                expert_id = get_k_experts(self.current_student_params,
                                          test_vec,
                                          initial_test_vectors_list,
                                          last_test_vector,
                                          student_ids,
                                          student_params,
                                          last_perfs,
                                          k=self.k,
                                          use_ground_truth=self.use_ground_truth,
                                          test_vec_idx=self.pre_test_epoch_idx - 1,
                                          is_toy_env=self.is_toy_env,
                                          is_v2=is_v2)
            self.bk['selected_expert'] = expert_id
            print('expert selected is: {}'.format(expert_id))

            # loading expert
            folder_path = 'teachDRL/data/elders_knowledge/' + expert_id.rsplit('_s', 1)[0] + '/' + expert_id
            print(folder_path)
            self.expert_means, self.expert_covs, self.expert_mean_rewards = load_expert_trajectory(
                folder_path, is_toy_env=self.is_toy_env)
            self._update()

            # add alpgmm gaussians
            if self.use_alpgmm and self.alpgmm.gmm is not None:
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
            return self.restart_after_pretrain
        return False

    def _update(self):
        if self.expert_type == 'P':
            # Pool type, single GMM out of all expert GMMs
            #print('P-updating')
            self.current_means = [sub_item for sub_list in self.expert_means for sub_item in sub_list]  # flatten
            self.current_covs = [sub_item for sub_list in self.expert_covs for sub_item in sub_list]  # same
        elif self.expert_type == 'T':
            # Time type, expert trajectory is stepped every 250 episodes
            #print('T-updating')
            self.expert_idx = min(self.episode_nb // 250, len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()  # flatten
            self.current_covs = self.expert_covs[self.expert_idx].copy()
        elif self.expert_type == 'R':
            # Reward type, expert trajectory is stepped when mean reward exceeds the previous one's
            #print('R-updating')
            self.expert_idx = min(self.expert_idx + 1, len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()
            self.current_covs = self.expert_covs[self.expert_idx].copy()
            self.current_mean_r = self.expert_mean_rewards[self.expert_idx] * self.tol_ratio
            self.reward_list = deque(maxlen=self.r_list_len)
        else:
            print('Unknown expert type')
            exit(1)

    def update(self, task, reward):
        #print("current means: {}, covs {}".format(len(self.current_means), len(self.current_covs)))
        #print("expert_idx: {}".format(self.expert_idx))
        self.episode_nb += 1

        if self.nb_test_epochs < self.pre_test_epoch_idx:
            # pre-test phase, only use alp-gmm
            self.is_new_alpgmm = self.alpgmm.update(task, reward)
            if self.is_new_alpgmm:
                self.bk['cegt_covariances'].append(self.alpgmm.gmm.covariances_.copy())
                self.bk['cegt_means'].append(self.alpgmm.gmm.means_.copy())
                self.bk['cegt_episodes'].append(self.episode_nb)
                self.bk['cegt_expert_idx'].append(self.expert_idx)
            return self.is_new_alpgmm

        just_updated_gmm = False

        # handle AGAIN-R/T to ALP-GMM transition after finishing expert curriculum
        if self.use_alpgmm and (self.expert_type == "R" or self.expert_type == 'T') and self.stop_R \
                and self.expert_idx == (len(self.expert_means) - 1) and self.nb_alpgmm_gaussians is not None:
            if self.nb_eps_after_R == 0:
                # when AGAIN reaches the end of the expert curriculum, it can change rnd sampling
                self.random_task_ratio = 0.1
                self.post_pre_test_task_ratio = self.in_end_rnd  # switch back to high-exploration strategy
                print('switching to rnd of {} since last IN idx reached'.format(self.in_end_rnd))
                if self.expert_type == 'R':
                    self.expert_type = "stoppedR"
                    self.bk['stoppedR_episode'] = self.episode_nb
                elif self.expert_type == 'T':
                    self.expert_type = "stoppedT"
                    self.bk['stoppedT_episode'] = self.episode_nb

        # handle AGAIN-R/T smooth re-update of last IN gaussian
        if self.use_alpgmm and (self.expert_type == "stoppedR" or self.expert_type == 'stoppedT') and self.stop_R \
                and self.expert_idx == (len(self.expert_means) - 1) and self.nb_alpgmm_gaussians is not None:
            if self.nb_eps_after_R == 0:
                # first time, init last IN GMM gaussian tracking to update ALP periodically
                self.last_IN_gaussians_alps = [
                    deque(maxlen=100)
                    for _ in range(len(self.current_means) - self.nb_alpgmm_gaussians)
                ]
                self.added_since_fit = 0
                assert ((len(self.current_means) - self.nb_alpgmm_gaussians) == len(self.expert_means[-1]))
                print('TIME TO START POST IN, last expert has len {} --> {}'.format(
                    len(self.expert_means[-1]), self.expert_means[-1]))
            elif self.added_since_fit == 100:
                # time to re-update the final IN lps gmm
                print('last in update time')
                #print(self.last_IN_gaussians_alps)
                just_updated_gmm = True
                self.added_since_fit = 0
                for i, alp_window in enumerate(self.last_IN_gaussians_alps):
                    if len(alp_window) == 0:
                        self.current_means[i][-1] = 0.0
                    else:
                        self.current_means[i][-1] = np.mean(alp_window)
                # remove alp-gmm gaussians to fit update pipeline (they will be re-added)
                self.current_means = self.current_means[:-self.nb_alpgmm_gaussians]
                self.current_covs = self.current_covs[:-self.nb_alpgmm_gaussians]
                print('post in update to {}'.format(self.current_means))

            if self.sampled_gaussian_idx < (len(self.current_means) - self.nb_alpgmm_gaussians):
                # last task from IN
                #print('adding alp to IN idx {} out of {}'.format(self.sampled_gaussian_idx, len(self.current_means) - self.nb_alpgmm_gaussians))
                self.last_IN_gaussians_alps[self.sampled_gaussian_idx].append(self.alpgmm.alps[-1])
                self.added_since_fit += 1
            self.nb_eps_after_R += 1

        # handle IN-R to ALP-GMM transition after finishing expert curriculum
        if self.expert_type == "R" and self.stop_R and self.expert_idx == (len(self.expert_means) - 1):
            self.use_alpgmm = True
            self.nb_eps_after_R += 1
            if self.nb_eps_after_R == 250:  # after a long time in last expert index, change strategy
                self.expert_type = "stoppedR"
                self.random_task_ratio = 0.1
                self.bk['stoppedR_episode'] = self.episode_nb
                # replace last expert idx by alpgmm gaussians
                self.current_means = []
                self.current_covs = []
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
                just_updated_gmm = True

        # PROCESS DATA FOR R or T variants
        if self.expert_type == 'R' and self.bk['cegt_tasks_origin'][-1] == 'egt':
            # add reward to list if from egt
            self.reward_list.append(reward)

        # check whether a GMM update is necessary, depending on the expert type
        if (self.expert_type == 'T' and (self.episode_nb % 250) == 0)\
           or (self.expert_type == 'R' and len(self.reward_list) == self.r_list_len
               and np.mean(self.reward_list) >= self.current_mean_r):
            if self.expert_idx != (len(self.expert_means) - 1):
                # if not already at the end of expert curricula
                self._update()
                just_updated_gmm = True

        if self.use_alpgmm:
            if just_updated_gmm and self.nb_alpgmm_gaussians is not None:
                # expert changed, add alpgmm part
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()

            # send data to alpgmm
            self.is_new_alpgmm = self.alpgmm.update(task, reward)

            if self.is_new_alpgmm:
                # update current GMM by replacing old gaussians from alpgmm with new ones
                if self.nb_alpgmm_gaussians is not None:
                    # remove old gaussians
                    self.current_means = self.current_means[:-self.nb_alpgmm_gaussians]
                    self.current_covs = self.current_covs[:-self.nb_alpgmm_gaussians]
                # add new gaussians
                #print('adding stuff')
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
                just_updated_gmm = True

        # book-keeping
        if just_updated_gmm:
            self.bk['cegt_covariances'].append(self.current_covs.copy())
            self.bk['cegt_means'].append(self.current_means.copy())
            self.bk['cegt_episodes'].append(self.episode_nb)
            self.bk['cegt_expert_idx'].append(self.expert_idx)
            self.bk['cegt_nb_alpgmm_gaussians'].append(self.nb_alpgmm_gaussians)
        return just_updated_gmm

    def sample_task(self):
        new_task = None
        task_origin = None
        #print(self.episode_nb)
        # pre-test phase, only use alp-gmm
        if self.nb_test_epochs < self.pre_test_epoch_idx:
            #print('pre-test-task-sampling')
            if (self.episode_nb < 250) or (np.random.random() < self.random_task_ratio):
                # Random task sampling
                new_task = self.alpgmm.random_task_generator.sample()
                task_origin = 'random'
            else:
                # alp-gmm task sampling
                task_origin = 'alpgmm'
                alp_means = []
                for pos in self.alpgmm.gmm.means_:
                    alp_means.append(pos[-1])

                # 2 - Sample Gaussian proportionally to its mean ALP
                idx = proportional_choice(alp_means, eps=0.0)

                # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
                new_task = np.random.multivariate_normal(
                    self.alpgmm.gmm.means_[idx], self.alpgmm.gmm.covariances_[idx])[:-1]
                new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32)
            self.bk['cegt_tasks_origin'].append(task_origin)
            return new_task

        #print(self.random_task_ratio)
        if self.use_alpgmm and np.random.random() < self.post_pre_test_task_ratio:
            # Random task sampling
            new_task = self.alpgmm.random_task_generator.sample()
            task_origin = 'random'
        else:
            # ALP-based task sampling
            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            alp_means = []
            for means in self.current_means:
                alp_means.append(means[-1])

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(alp_means, eps=0.0)
            self.sampled_gaussian_idx = idx

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = np.random.multivariate_normal(self.current_means[idx],
                                                     self.current_covs[idx])[:-1]
            new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32)
            task_origin = 'egt'
            if self.use_alpgmm and self.alpgmm.gmm is not None:
                if idx >= len(self.current_means) - self.nb_alpgmm_gaussians:
                    task_origin = 'alpgmm'

        #print(task_origin)
        # boring book-keeping
        self.bk['cegt_tasks_origin'].append(task_origin)
        return new_task

    def dump(self, dump_dict):
        self.bk['cegt_initial_expert_means'] = self.expert_means
        self.bk['cegt_initial_expert_covs'] = self.expert_covs
        self.bk['cegt_student_param'] = self.current_student_params
        if self.expert_type == 'R' or self.expert_type == "stoppedR":
            self.bk['cegt_initial_expert_mean_rewards'] = self.expert_mean_rewards
        dump_dict.update(self.bk)
        if self.use_alpgmm:
            dump_dict.update(self.alpgmm.bk)
        return dump_dict
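# Hedged construction sketch: the params keys below are the ones read in AGAIN.__init__ above;
# the bounds and concrete values are illustrative only.
#
#   teacher = AGAIN([0.0, 0.0], [1.0, 1.0], seed=42,
#                   params={'use_alpgmm': True,
#                           'expert_type': 'R',              # 'P', 'T' or 'R'
#                           'pretrain_epochs': 2,
#                           'k': 5,
#                           'classroom_filename': 'student_history',
#                           'student_params': {},            # descriptor of the current student
#                           'is_toy_env': True})
#   task = teacher.sample_task()       # pre-test phase: random / ALP-GMM sampling
#   teacher.update(task, reward)       # episodic reward drives ALP-GMM fits and expert stepping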
class EGT():
    def __init__(self, mins, maxs, seed=None, params=dict()):
        self.seed = seed
        if not seed:
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        # Task space boundaries
        self.mins = np.array(mins)
        self.maxs = np.array(maxs)

        self.use_alpgmm = False if "use_alpgmm" not in params else params['use_alpgmm']
        #self.decorelate_alpgmm = False if "decorelate_alpgmm" not in params else params['decorelate_alpgmm']
        self.nb_alpgmm_gaussians = None
        if self.use_alpgmm:
            print("Using ALP-GMM with EGT")
            self.alpgmm = ALPGMM(mins, maxs, seed=seed, params=params)
            self.is_new_alpgmm = False  # boolean used to track alpgmm's periodic updates
            self.random_task_ratio = 0.02 if "random_task_ratio" not in params else params["random_task_ratio"]
            self.sampled_gaussian_idx = None

        self.stop_R = False if "stop_R" not in params else params['stop_R']
        self.nb_eps_after_R = 0

        assert ('expert_gmms' in params)
        self.expert_means, self.expert_covs, self.expert_mean_rewards = params['expert_gmms']
        self.expert_type = "P" if "expert_type" not in params else params["expert_type"]
        self.r_list_len = 50 if "r_list_len" not in params else params["r_list_len"]
        self.tol_ratio = 1.0 if "tol_ratio" not in params else params["tol_ratio"]
        if self.expert_type == 'R':
            self.reward_list = deque(maxlen=self.r_list_len)
        self.expert_idx = -1
        self.episode_nb = 0
        self.current_means = None
        self.current_covs = None
        self.current_mean_r = None

        # Boring book-keeping
        self._update()
        self.bk = {
            'egt_covariances': [self.current_covs.copy()],
            'egt_means': [self.current_means.copy()],
            'egt_episodes': [self.episode_nb],
            'egt_tasks_origin': [],
            'egt_nb_alpgmm_gaussians': [self.nb_alpgmm_gaussians],
            'egt_expert_idx': [self.expert_idx]
        }

    def _update(self):
        if self.expert_type == 'P':
            # Pool type, single GMM out of all expert GMMs
            #print('P-updating')
            self.current_means = [sub_item for sub_list in self.expert_means for sub_item in sub_list]  # flatten
            self.current_covs = [sub_item for sub_list in self.expert_covs for sub_item in sub_list]  # same
        elif self.expert_type == 'T':
            # Time type, expert trajectory is stepped every 250 episodes
            #print('T-updating')
            self.expert_idx = min(self.episode_nb // 250, len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()  # flatten
            self.current_covs = self.expert_covs[self.expert_idx].copy()
        elif self.expert_type == 'R':
            # Reward type, expert trajectory is stepped when mean reward exceeds the previous one's
            #print('R-updating')
            self.expert_idx = min(self.expert_idx + 1, len(self.expert_means) - 1)
            self.current_means = self.expert_means[self.expert_idx].copy()
            self.current_covs = self.expert_covs[self.expert_idx].copy()
            self.current_mean_r = self.expert_mean_rewards[self.expert_idx] * self.tol_ratio
            self.reward_list = deque(maxlen=self.r_list_len)
        else:
            print('Unknown expert type')
            exit(1)

    def update(self, task, reward):
        #print("current means: {}, covs {}".format(len(self.current_means), len(self.current_covs)))
        #print("expert_idx: {}".format(self.expert_idx))
        self.episode_nb += 1
        just_updated_gmm = False

        if self.use_alpgmm and self.expert_type == "R" and self.stop_R \
                and self.expert_idx == (len(self.expert_means) - 1) and self.nb_alpgmm_gaussians is not None:
            self.nb_eps_after_R += 1
            if self.nb_eps_after_R == 250:  # after a long time in last expert index, change strategy
                self.expert_type = "stoppedR"
                self.random_task_ratio = 0.1
                self.current_means = []
                self.current_covs = []
                self.bk['stoppedR_episode'] = self.episode_nb
                just_updated_gmm = True

        # process new data
        if self.expert_type == 'R' and self.bk['egt_tasks_origin'][-1] == 'egt':
            # add reward to list if from egt
            self.reward_list.append(reward)

        # check whether a GMM update is necessary, depending on the expert type
        if (self.expert_type == 'T' and (self.episode_nb % 250) == 0)\
           or (self.expert_type == 'R' and len(self.reward_list) == self.r_list_len
               and np.mean(self.reward_list) > self.current_mean_r):
            if self.expert_idx != (len(self.expert_means) - 1):
                # if not already at the end of expert curricula
                self._update()
                just_updated_gmm = True

        if self.use_alpgmm:
            if just_updated_gmm and self.nb_alpgmm_gaussians is not None:
                # expert changed, add alpgmm part
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()

            # send data to alpgmm
            self.is_new_alpgmm = self.alpgmm.update(task, reward)

            if self.is_new_alpgmm:
                # update current GMM by replacing old gaussians from alpgmm with new ones
                if self.nb_alpgmm_gaussians is not None:
                    # remove old gaussians
                    self.current_means = self.current_means[:-self.nb_alpgmm_gaussians]
                    self.current_covs = self.current_covs[:-self.nb_alpgmm_gaussians]
                # add new gaussians
                #print('adding stuff')
                self.current_means += self.alpgmm.gmm.means_.tolist()
                self.current_covs += self.alpgmm.gmm.covariances_.tolist()
                self.nb_alpgmm_gaussians = len(self.alpgmm.gmm.means_)
                just_updated_gmm = True

        # # smoothly update the ALP value of expert gaussians if at last IEC index
        # if self.expert_idx == (len(self.expert_means) - 1) and self.use_alpgmm and self.nb_alpgmm_gaussians is not None:
        #     if self.expert_type == 'T' or (self.expert_type == 'R' and np.mean(self.reward_list) > self.current_mean_r):
        #         if self.bk['egt_tasks_origin'][-1] == 'egt':
        #             assert(self.sampled_gaussian_idx < (len(self.current_means) - self.nb_alpgmm_gaussians))
        #             cur_alp = self.current_means[self.sampled_gaussian_idx][-1]  # update alp of corresponding Gaussian
        #             self.current_means[self.sampled_gaussian_idx][-1] = cur_alp * (49/50) + (self.alpgmm.alps[-1]/50)

        # book-keeping
        if just_updated_gmm:
            self.bk['egt_covariances'].append(self.current_covs.copy())
            self.bk['egt_means'].append(self.current_means.copy())
            self.bk['egt_episodes'].append(self.episode_nb)
            self.bk['egt_expert_idx'].append(self.expert_idx)
            self.bk['egt_nb_alpgmm_gaussians'].append(self.nb_alpgmm_gaussians)

    def sample_task(self):
        new_task = None
        task_origin = None
        if self.use_alpgmm and np.random.random() < self.random_task_ratio:
            # Random task sampling
            new_task = self.alpgmm.random_task_generator.sample()
            task_origin = 'random'
        else:
            # ALP-based task sampling
            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            alp_means = []
            for means in self.current_means:
                alp_means.append(means[-1])

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(alp_means, eps=0.0)
            self.sampled_gaussian_idx = idx

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = np.random.multivariate_normal(self.current_means[idx],
                                                     self.current_covs[idx])[:-1]
            new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32)
            task_origin = 'egt'
            if self.use_alpgmm and self.alpgmm.gmm is not None:
                if idx >= len(self.current_means) - self.nb_alpgmm_gaussians:
                    task_origin = 'alpgmm'

        #print(task_origin)
        # boring book-keeping
        self.bk['egt_tasks_origin'].append(task_origin)
        return new_task

    def dump(self, dump_dict):
        self.bk['egt_initial_expert_means'] = self.expert_means
        self.bk['egt_initial_expert_covs'] = self.expert_covs
        if self.expert_type == 'R' or self.expert_type == "stoppedR":
            self.bk['egt_initial_expert_mean_rewards'] = self.expert_mean_rewards
        dump_dict.update(self.bk)
        if self.use_alpgmm:
            dump_dict.update(self.alpgmm.bk)
        return dump_dict