def make_multivariatenormal_dist(self, is_gpu=False):
    # Random mean, plus a random positive-definite covariance per batch
    # element; the distribution is parameterized by its Cholesky factor.
    loc = numpy.random.uniform(
        -1, 1, self.shape + (3,)).astype(numpy.float32)
    cov = numpy.random.normal(size=(numpy.prod(self.shape),) + (3, 3))
    cov = [cov_.dot(cov_.T) for cov_ in cov]
    cov = numpy.vstack(cov).reshape(self.shape + (3, 3))
    scale_tril = numpy.linalg.cholesky(cov).astype(numpy.float32)
    params = self.encode_params(
        {"loc": loc, "scale_tril": scale_tril}, is_gpu)
    return distributions.MultivariateNormal(**params)
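
# Illustrative sketch (not part of the original source): a standalone
# construction mirroring the helper above, assuming Chainer's
# distributions.MultivariateNormal with a loc/scale_tril parameterization.
# The batch shape (2,), the helper name, and the shape notes in the comments
# are made up for illustration.
def _example_multivariatenormal_sketch():
    import numpy
    from chainer import distributions

    shape = (2,)
    loc = numpy.random.uniform(-1, 1, shape + (3,)).astype(numpy.float32)
    cov = numpy.random.normal(size=(numpy.prod(shape),) + (3, 3))
    cov = numpy.vstack([c.dot(c.T) for c in cov]).reshape(shape + (3, 3))
    scale_tril = numpy.linalg.cholesky(cov).astype(numpy.float32)

    dist = distributions.MultivariateNormal(loc, scale_tril=scale_tril)
    x = dist.sample()        # one draw per batch element, shape (2, 3)
    return dist.log_prob(x)  # per-element log-density, shape (2,)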
def train_lgc(args, model):
    """
    Train a stochastic (Gaussian) policy that acts on [z_t, h_t] in a virtual
    world dictated by ``model``, using the policy gradient (REINFORCE).

    :param args: experiment configuration
    :param model: trained world model used as the "dream" environment
    :return: the trained PolicyNet, i.e. the linear controller with
             coefficients W_c and b_c in W_c [z_t, h_t] + b_c
    """
    episode_durations = []

    random_rollouts_dir = os.path.join(args.data_dir, args.game,
                                       args.experiment_name, 'random_rollouts')
    initial_z_t = ModelDataset(dir=random_rollouts_dir,
                               load_batch_size=args.initial_z_size,
                               verbose=False)

    num_episode = 10
    batch_size = 5
    gamma = 0.99

    policy_net = PolicyNet(args)
    optimizer = optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(policy_net)

    gpu = None if args.gpu < 0 else args.gpu

    # Batch history
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0

    for e in range(num_episode):
        # Grab an initial latent z_t from the historical random rollouts and
        # start the RNN state (h_t, c_t) from zeros.
        z_t, _, _, _, _ = initial_z_t[np.random.randint(len(initial_z_t))]
        z_t = z_t[0]
        if gpu is not None:
            z_t = cuda.to_gpu(z_t)
        if args.initial_z_noise > 0.:
            # Optionally perturb the initial latent for exploration.
            if gpu is not None:
                z_t += cp.random.normal(
                    0., args.initial_z_noise, z_t.shape).astype(cp.float32)
            else:
                z_t += np.random.normal(
                    0., args.initial_z_noise, z_t.shape).astype(np.float32)
        if gpu is not None:
            h_t = cp.zeros(args.hidden_dim).astype(cp.float32)
            c_t = cp.zeros(args.hidden_dim).astype(cp.float32)
        else:
            h_t = np.zeros(args.hidden_dim).astype(np.float32)
            c_t = np.zeros(args.hidden_dim).astype(np.float32)

        for t in count():
            # Gaussian policy: mean from the linear controller, fixed
            # diagonal scale (per-dimension std = action_policy_std).
            mean_a_t = policy_net(args, z_t, h_t, c_t)
            action_policy_std = 0.1
            scale_tril = action_policy_std * np.identity(args.action_dim)
            stochastic_policy = D.MultivariateNormal(
                loc=mean_a_t.astype(np.float32),
                scale_tril=scale_tril.astype(np.float32))
            a_t = stochastic_policy.sample()

            # Step the dream environment: the model predicts the next latent
            # and a done probability.
            z_t, done = model(z_t, a_t, temperature=args.temperature)
            done = done.data[0]
            reward = 1.0  # survival reward: +1 per step
            if done >= args.done_threshold:
                done = True
            else:
                done = False
            h_t = model.get_h().data[0]
            c_t = model.get_c().data[0]

            state_pool.append((z_t, h_t, c_t))
            action_pool.append(a_t)
            reward_pool.append(reward)

            steps += 1

            if done:
                episode_durations.append(t + 1)
                break

        # Update policy every batch_size episodes
        if e > 0 and e % batch_size == 0:
            # Discount rewards
            running_add = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    running_add = 0
                else:
                    running_add = running_add * gamma + reward_pool[i]
                    reward_pool[i] = running_add

            # Normalize rewards
            reward_mean = np.mean(reward_pool)
            reward_std = np.std(reward_pool)
            for i in range(steps):
                reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient descent on the REINFORCE loss
            policy_net.cleargrads()

            for i in range(steps):
                z_t, h_t, c_t = state_pool[i]
                action = action_pool[i]
                reward = reward_pool[i]

                mean_a_t = policy_net(args, z_t, h_t, c_t)
                action_policy_std = 0.1
                scale_tril = action_policy_std * np.identity(args.action_dim)
                stochastic_policy = D.MultivariateNormal(
                    loc=mean_a_t.astype(np.float32),
                    scale_tril=scale_tril.astype(np.float32))
                # Negative log-likelihood of the taken action, weighted by
                # the normalized, discounted reward.
                loss = -stochastic_policy.log_prob(action) * reward
                loss.backward()
            optimizer.update()

            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0

    return policy_net
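
# Illustrative usage sketch (not part of the original source): roll the
# trained controller out in the learned "dream" model using its mean action,
# without exploration noise. It reuses the same args/model/ModelDataset/
# PolicyNet machinery that train_lgc itself depends on; the helper name, the
# CPU-only setup, and the 1000-step cap are assumptions made for this example.
def _example_dream_rollout(args, model):
    policy_net = train_lgc(args, model)

    random_rollouts_dir = os.path.join(args.data_dir, args.game,
                                       args.experiment_name, 'random_rollouts')
    initial_z_t = ModelDataset(dir=random_rollouts_dir,
                               load_batch_size=args.initial_z_size,
                               verbose=False)

    z_t, _, _, _, _ = initial_z_t[np.random.randint(len(initial_z_t))]
    z_t = z_t[0]
    h_t = np.zeros(args.hidden_dim).astype(np.float32)
    c_t = np.zeros(args.hidden_dim).astype(np.float32)

    for t in range(1000):
        a_t = policy_net(args, z_t, h_t, c_t)   # mean action only
        z_t, done = model(z_t, a_t, temperature=args.temperature)
        h_t = model.get_h().data[0]
        c_t = model.get_c().data[0]
        if done.data[0] >= args.done_threshold:
            break
    return t + 1  # steps survived in the dream before termination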