def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size):  # (2)
    """Collect at least `min_batch_size` environment steps with `policy`;
    return (memory, log) directly or put them on `queue` when run in a worker process."""
    torch.randn(pid)
    log = dict()
    memory = Memory()  # every time we collect a batch the memory is re-initialized
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:  # collect samples from episodes until we have at least a batch
        state = env.reset()  # (maybe more, since we stop when the episode ends)
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0
        episode = []

        for t in range(10000):  # gym envs already enforce an upper bound on the number of steps
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                action_mean, action_log_std, action_std = policy(state_var)
                if mean_action:
                    action = action_mean.numpy()  # use mean value
                else:
                    action = policy.select_action(state_var)[0].numpy()  # sample from normal distribution
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:  # running list of normalized states allowing to access precise mean and std
                next_state = running_state(next_state)

            if custom_reward is not None:  # None by default, unless given when the Agent is initialized
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            episode.append(Transition(state, action, next_state, reward,
                                      action_mean, action_std.numpy(), None))

            if render:
                env.render()
            if done:
                memory.push(episode)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
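# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): `running_state` above is
# only described as a "running list of normalized states allowing to access
# precise mean and std". A minimal filter with the call signature used above
# could look like the hypothetical class below (Welford's online mean/variance);
# all names here are local to this example.
import numpy as np


class RunningNorm:
    """Hypothetical running-normalization filter with the `state = running_state(state)` interface."""

    def __init__(self, dim, eps=1e-8):
        self.n = 0
        self.mean = np.zeros(dim)
        self.m2 = np.zeros(dim)  # running sum of squared deviations
        self.eps = eps

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + self.eps
        return (x - self.mean) / std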
def collect_samples(pid, env, policy, num_req_steps, num_req_episodes, mean_action,
                    render, running_state, context_points_list, attention, fixed_sigma):
    """Collect trajectories while conditioning the policy on a single context set built
    once from `context_points_list` (handles DKL, MI, NP and ANP policy variants)."""
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    action_sum = zeros(context_points_list[0][1].shape[-1])

    with torch.no_grad():
        all_x_context, all_y_context = merge_context(context_points_list)  # merge episodes into one context set

        # compute step-independent values
        if policy.id == 'DKL':
            policy.set_train_data(inputs=all_x_context.squeeze(0),
                                  targets=all_y_context.view(-1), strict=False)
        elif policy.id in ('NP', 'ANP'):  # compute context representation and latent variable
            if attention:
                encoder_input, keys = policy.xy_to_a.get_input_key(all_x_context, all_y_context)
            else:
                r_context = policy.xy_to_r(all_x_context, all_y_context)
            _, z_dist = policy.sample_z(all_x_context, all_y_context)

        while num_steps < num_req_steps or num_episodes < num_req_episodes:
            episode = []
            reward_episode = 0
            if policy.id in ('NP', 'ANP'):
                z_sample = z_dist.sample()
                if not attention:
                    rep = torch.cat([z_sample, r_context], dim=-1)
            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()

            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                if policy.id == 'DKL':
                    with gpytorch.settings.use_toeplitz(True), gpytorch.settings.fast_pred_var():
                        pi = policy(state_var)
                        mean = pi.mean
                        stddev = pi.stddev
                elif policy.id == 'MI':
                    mean = policy(all_x_context, all_y_context, state_var)
                    stddev = fixed_sigma
                else:  # NPs and ANPs
                    if attention:
                        a_repr = policy.xy_to_a.get_repr(encoder_input, keys, state_var)
                        representation = torch.cat([z_sample, a_repr.squeeze(0)], dim=-1)
                        mean, stddev = policy.xz_to_y(state_var, representation)
                    else:
                        mean, stddev = policy.xrep_to_y(state_var, rep)

                if fixed_sigma is not None:
                    sigma = fixed_sigma  # use sigma learnt by the update step
                else:
                    sigma = stddev.view(-1)  # use predicted sigma (NPs)
                action_distribution = Normal(mean, sigma)
                if mean_action:
                    action = mean.view(-1)  # use mean value
                    mean_rep = torch.cat([z_dist.mean, r_context], dim=-1)
                    mean, stddev = policy.xrep_to_y(state_var, mean_rep)
                    mean_s, _ = policy.xrep_to_y(
                        state_var, torch.cat([z_dist.mean + z_dist.stddev, r_context], dim=-1))
                    sigma = torch.abs(mean_s - mean)
                else:
                    action = action_distribution.sample().view(-1)  # sample from normal distribution
                cov = torch.diag(sigma.view(-1) ** 2)
                next_state, reward, done, _ = env.step(action.cpu().numpy())
                reward_episode += reward
                if running_state is not None:  # running list of normalized states allowing to access precise mean and std
                    next_state = running_state(next_state)

                episode.append(Transition(state, action.cpu().numpy(), next_state, reward,
                                          mean.cpu().numpy(), sigma.cpu().numpy(), None, cov))
                action_sum += action

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state

            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    print('tot episodes: ', num_episodes)
    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['action_mean'] = action_sum / num_steps
    return memory, log
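# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): `merge_context` above is
# only described as merging the stored episodes into one context set. A
# hypothetical helper with that behaviour, assuming each entry of
# `context_points_list` is an (x, y, real_len) tuple with tensors of shape
# [1, T, d], might simply concatenate the valid points along the point axis.
# All names below are local to this example.
import torch


def merge_context_example(context_points_list):
    """Hypothetical merge: concatenate per-episode context points into one set."""
    xs = [ep[0][:, :ep[-1], :] for ep in context_points_list]
    ys = [ep[1][:, :ep[-1], :] for ep in context_points_list]
    return torch.cat(xs, dim=1), torch.cat(ys, dim=1)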
def collect_samples_mlp(pid, env, policy, num_req_steps, num_req_episodes, custom_reward,
                        render, running_state, fixed_sigma):
    """Collect trajectories with an MLP policy whose output is used as the action mean,
    paired with a fixed standard deviation `fixed_sigma`."""
    torch.randn(pid)
    log = dict()
    memory = Memory()  # every time we collect a batch the memory is re-initialized
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    with torch.no_grad():
        while num_steps < num_req_steps or num_episodes < num_req_episodes:
            episode = []
            reward_episode = 0
            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()

            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                pi = policy(state_var)
                mean = pi
                # stddev = pi.stddev
                sigma = fixed_sigma
                cov = torch.diag(sigma ** 2)
                action_distribution = Normal(mean, sigma)
                action = action_distribution.sample()  # sample from normal distribution
                next_state, reward, done, _ = env.step(action.cpu())
                reward_episode += reward
                if running_state is not None:
                    next_state = running_state(next_state)

                if custom_reward is not None:
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

                episode.append(Transition(state, action.cpu().numpy(), next_state, reward,
                                          mean.cpu().numpy(), sigma.cpu().numpy(), None, cov))

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state

            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    print('tot episodes: ', num_episodes)
    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward
    return memory, log
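# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the MLP collector above
# treats the network output as the action mean and pairs it with a fixed,
# externally learnt standard deviation. The snippet below shows that sampling
# step in isolation with toy values; all names are local to this example.
import torch
from torch.distributions import Normal

toy_mean = torch.tensor([[0.2, -0.1]])        # stand-in for `policy(state_var)`
toy_fixed_sigma = torch.tensor([0.3, 0.3])    # per-dimension std kept fixed during collection
toy_cov = torch.diag(toy_fixed_sigma ** 2)    # diagonal covariance stored in the Transition

toy_action = Normal(toy_mean, toy_fixed_sigma).sample().view(-1)
print(toy_action, toy_cov)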
def collect_samples(pid, env, policy, custom_reward, mean_action, render,
                    running_state, context_points_list, attention, fixed_sigma):  # (2)
    """Collect one trajectory per stored episode while growing the context set
    autoregressively: at every step the visited state and predicted mean are appended."""
    torch.randn(pid)
    log = dict()
    memory = Memory()  # every time we collect a batch the memory is re-initialized
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    action_sum = zeros(context_points_list[0][1].shape[-1])

    with torch.no_grad():
        for ep in range(len(context_points_list)):
            all_x_context_list = [context_points_list[0][0][:, [0], :]]
            all_y_context_list = [context_points_list[0][1][:, [0], :]]
            episode = []
            reward_episode = 0
            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()

            for t in range(10000):
                all_x_context = torch.cat(all_x_context_list, dim=1)
                all_y_context = torch.cat(all_y_context_list, dim=1)
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                if policy.id == 'DKL':
                    with gpytorch.settings.use_toeplitz(True), gpytorch.settings.fast_pred_var():
                        pi = policy(state_var)
                        mean = pi.mean
                        stddev = pi.stddev
                        if torch.isnan(stddev):
                            print(stddev)
                elif policy.id == 'MI':
                    mean = policy(all_x_context, all_y_context, state_var)
                    stddev = fixed_sigma
                else:  # NP / ANP policies take the same call here, with or without attention
                    pi = policy(all_x_context, all_y_context, state_var)
                    mean = pi.mean
                    stddev = pi.stddev

                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev
                action_distribution = Normal(mean, sigma)
                if mean_action:
                    action = mean  # use mean value
                    mean, stddev = policy.xz_to_y(state_var, z_dist.mean)
                else:
                    action = action_distribution.sample().view(-1)  # sample from normal distribution
                cov = torch.diag(sigma ** 2)
                next_state, reward, done, _ = env.step(action.cpu())
                reward_episode += reward
                if running_state is not None:  # running list of normalized states allowing to access precise mean and std
                    next_state = running_state(next_state)

                if custom_reward is not None:  # None by default, unless given when the Agent is initialized
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

                if any(torch.isnan(state_var.view(-1))) or any(torch.isnan(action.view(-1))) or any(torch.isnan(mean.view(-1))):
                    print('NaN encountered in state, action or mean')
                all_x_context_list.append(state_var)
                all_y_context_list.append(mean)

                episode.append(Transition(state, action.cpu().numpy(), next_state, reward,
                                          mean.cpu().numpy(), sigma.cpu().numpy(), None, cov))
                action_sum += action

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state

            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['action_mean'] = action_sum / num_steps
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward
    return memory, log
def collect_samples(pid, env, policy, num_ep, custom_reward, render,
                    running_state, fixed_sigma):  # (2)
    """Collect `num_ep` episodes with a policy that maps states directly to a
    predictive distribution (mean and stddev)."""
    torch.randn(pid)
    log = dict()
    memory = Memory()  # every time we collect a batch the memory is re-initialized
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    with torch.no_grad():
        for ep in range(num_ep):
            episode = []
            reward_episode = 0
            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()

            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                pi = policy(state_var)
                mean = pi.mean
                stddev = pi.stddev
                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev
                action_distribution = Normal(mean, sigma)
                action = action_distribution.sample().squeeze(0).squeeze(0)  # sample from normal distribution
                next_state, reward, done, _ = env.step(action)
                reward_episode += reward
                if running_state is not None:  # running list of normalized states allowing to access precise mean and std
                    next_state = running_state(next_state)

                if custom_reward is not None:  # None by default, unless given when the Agent is initialized
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

                episode.append(Transition(state, action.numpy(), next_state, reward,
                                          mean.numpy(), stddev.numpy(), None))

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state

            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward
    return memory, log
def collect_samples(pid, env, policy, custom_reward, mean_action, render,
                    running_state, context_points_list, attention, fixed_sigma):  # (2)
    """Collect one trajectory per context episode in `context_points_list`,
    conditioning on a sampled latent `z` (and on attention keys for ANPs)."""
    torch.randn(pid)
    log = dict()
    memory = Memory()  # every time we collect a batch the memory is re-initialized
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    for episode_contexts in context_points_list:
        episode = []
        reward_episode = 0
        x_context, y_context, real_len = episode_contexts
        if attention:
            encoder_input, keys = policy.xy_to_a.get_input_key(x_context[:real_len], y_context[:real_len])
        _, z_dist = policy.sample_z(x_context[:real_len], y_context[:real_len])
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        z_sample = z_dist.sample()

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0).unsqueeze(0)
            with torch.no_grad():
                if attention:
                    a_repr = policy.xy_to_a.get_repr(encoder_input, keys, state_var)
                    representation = torch.cat([z_sample, a_repr.squeeze(0)], dim=-1)
                    mean, stddev = policy.xz_to_y(state_var, representation)
                else:
                    mean, stddev = policy.xz_to_y(state_var, z_sample)
                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev
                action_distribution = Normal(mean, sigma)
                if mean_action:
                    action = mean  # use mean value
                    mean, stddev = policy.xz_to_y(state_var, z_dist.mean)
                else:
                    action = action_distribution.sample().squeeze(0).squeeze(0)  # sample from normal distribution
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:  # running list of normalized states allowing to access precise mean and std
                next_state = running_state(next_state)

            if custom_reward is not None:  # None by default, unless given when the Agent is initialized
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            episode.append(Transition(state, action.numpy(), next_state, reward,
                                      mean.numpy(), stddev.numpy(), None))

            if render:
                env.render()
            if done:
                memory.push(episode)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward
    return memory, log
def collect_samples(pid, env, policy, num_req_steps, num_req_episodes, num_context,
                    render, running_state, context_points_list, pick_dist, fixed_sigma):
    """Collect trajectories using, at every step, the `num_context` context points
    closest to the current state (via `get_close_context`) when the full set is too large."""
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    action_sum = zeros(context_points_list[0][1].shape[-1])

    # merge all episodes in the RM into a single set
    all_x = torch.cat([ep[0][:ep[-1], :] for ep in context_points_list], dim=-2)
    all_y = torch.cat([ep[1][:ep[-1], :] for ep in context_points_list], dim=-2)
    num_tot_context = all_x.shape[-2]
    if num_tot_context < num_context:  # no need to select a subset
        pick = False
        all_x_context, all_y_context = [all_x.view(1, num_tot_context, -1),
                                        all_y.view(1, num_tot_context, -1)]
    else:
        pick = True

    with torch.no_grad():
        while num_steps < num_req_steps or num_episodes < num_req_episodes:
            # print('ep: ', ep)
            episode = []
            reward_episode = 0
            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()

            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                if pick:
                    all_x_context, all_y_context = get_close_context(t, state_var, context_points_list,
                                                                     pick_dist, num_tot_context=num_context)
                if policy.id == 'DKL':
                    policy.set_train_data(all_x_context.squeeze(0),
                                          all_y_context.squeeze(0).squeeze(-1), strict=False)
                    pi = policy(state_var)
                    mean = pi.mean
                    stddev = pi.stddev
                elif policy.id == 'MI':
                    mean = policy(all_x_context, all_y_context, state_var)
                    stddev = fixed_sigma
                else:
                    pi = policy(all_x_context, all_y_context, state_var)
                    mean = pi.mean
                    stddev = pi.stddev

                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev.view(-1)
                cov = torch.diag(sigma ** 2)
                action_distribution = MultivariateNormal(mean, cov)
                action = action_distribution.sample().view(-1)  # sample from normal distribution
                next_state, reward, done, _ = env.step(action.cpu().numpy())
                reward_episode += reward
                if running_state is not None:  # running list of normalized states allowing to access precise mean and std
                    next_state = running_state(next_state)

                episode.append(Transition(state, action.cpu().numpy(), next_state, reward,
                                          mean.cpu().numpy(), sigma.cpu().numpy(), None, cov))
                action_sum += action

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state

            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    print('tot episodes: ', num_episodes)
    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['action_mean'] = action_sum / num_steps
    return memory, log
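# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the collectors above draw
# actions either from an independent Normal per dimension (MLP / NP variants) or
# from a MultivariateNormal with a diagonal covariance (the context-picking
# variant). With a diagonal covariance the two define the same distribution; the
# snippet below just demonstrates that equivalence with toy values, all local to
# this example.
import torch
from torch.distributions import Normal, MultivariateNormal

example_mean = torch.tensor([0.1, -0.3])
example_sigma = torch.tensor([0.5, 0.2])           # per-dimension std, as with fixed_sigma
example_cov = torch.diag(example_sigma ** 2)       # diagonal covariance, as built above

independent = Normal(example_mean, example_sigma)
joint = MultivariateNormal(example_mean, example_cov)

probe_action = torch.tensor([0.0, 0.0])
# the sum of per-dimension log-probs equals the joint log-prob under a diagonal covariance
print(independent.log_prob(probe_action).sum().item(), joint.log_prob(probe_action).item())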