def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None):
    self.env = gym.make(env_name)
    self.task_horizon = TASK_HORIZON
    self.agent = Agent(self.env)

    # Plan with the learned dynamics model only, never the ground truth.
    mpc_params = dict(mpc_params or {})  # guard against the default None
    mpc_params['use_gt_dynamics'] = False

    # Probabilistic ensemble of dynamics networks.
    self.model = PENN(num_nets, STATE_DIM,
                      len(self.env.action_space.sample()), LR)

    # MPC with a CEM action optimizer and with a random-shooting optimizer.
    self.cem_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                          NUM_ELITES, MAX_ITERS, **mpc_params,
                          use_random_optimizer=False)
    self.random_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                             NUM_ELITES, MAX_ITERS, **mpc_params,
                             use_random_optimizer=True)

    # Baseline that samples random actions without any planning.
    self.random_policy_no_mpc = RandomPolicy(
        len(self.env.action_space.sample()))
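
# Usage sketch for the constructor above. The enclosing class name is not
# shown in this excerpt, so ExperimentModelDynamics below is an assumed
# placeholder; the constructor forces mpc_params['use_gt_dynamics'] = False
# before forwarding everything in mpc_params to MPC.
exp = ExperimentModelDynamics(env_name='Pushing2D-v1', num_nets=2,
                              mpc_params={})
sample = exp.agent.sample(exp.task_horizon, exp.cem_policy)
print('Episode return:', sample['reward_sum'])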

def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None):
    self.env = gym.make(env_name)
    # self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.device = torch.device('cpu')
    self.task_horizon = TASK_HORIZON

    # Tensorboard logging.
    self.timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    self.environment_name = "pusher"
    self.logdir = 'logs/%s/%s' % (self.environment_name, self.timestamp)
    self.summary_writer = SummaryWriter(self.logdir)

    self.agent = Agent(self.env)

    # Plan with the learned dynamics model only, never the ground truth.
    mpc_params = dict(mpc_params or {})  # guard against the default None
    mpc_params['use_gt_dynamics'] = False

    self.model = PENN(num_nets, STATE_DIM,
                      len(self.env.action_space.sample()), LR, self.device,
                      self.summary_writer, self.timestamp,
                      self.environment_name)

    # MPC with a CEM action optimizer and with a random-shooting optimizer.
    self.cem_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                          NUM_ELITES, MAX_ITERS,
                          use_random_optimizer=False, **mpc_params)
    self.random_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                             NUM_ELITES, MAX_ITERS,
                             use_random_optimizer=True, **mpc_params)

    # Baseline that samples random actions without any planning.
    self.random_policy_no_mpc = RandomPolicy(
        len(self.env.action_space.sample()))
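
# Minimal logging sketch for the SummaryWriter configured above. add_scalar,
# flush and close are standard torch.utils.tensorboard calls; the tag name,
# log directory and dummy values here are illustrative, not taken from the
# assignment code.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('logs/pusher/example')
for step, episode_reward in enumerate([-42.0, -17.5, 0.0]):
    writer.add_scalar('train/episode_reward', episode_reward, step)
writer.flush()
writer.close()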

import gym
import numpy as np
# Agent, RandomPolicy, MPC, PENN and the hyperparameter constants
# (TASK_HORIZON, NUM_NETS, STATE_DIM, etc.) are defined elsewhere in the project.


class Experiment:
    def __init__(self):
        self.env = gym.make('Pushing2D-v1')
        self.task_hor = TASK_HORIZON
        self.agent = Agent(self.env)
        self.model = PENN(NUM_NETS, STATE_DIM, ACTION_DIM, LR)
        self.policy = MPC(self.env, NUM_PARTICLES, PLAN_HOR, self.model,
                          POPSIZE, NUM_ELITES, MAX_ITERS)

    def test(self, num_episodes):
        samples = []
        for j in range(num_episodes):
            samples.append(self.agent.sample(self.task_hor, self.policy))
        print("Rewards obtained:",
              np.mean([sample["reward_sum"] for sample in samples]))
        # An episode counts as a success when its final reward is exactly 0.
        print("Percent success:",
              np.mean([sample["rewards"][-1] == 0 for sample in samples]))
        return np.mean([sample["rewards"][-1] == 0 for sample in samples])

    def train(self):
        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
        test_results = []
        samples = []

        # Warm-start the dynamics model with random-policy rollouts.
        rand_pol = RandomPolicy(2)
        for i in range(NINIT_ROLLOUTS):
            samples.append(self.agent.sample(self.task_hor, rand_pol))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
        if NINIT_ROLLOUTS > 0:
            self.policy.train(
                [sample["obs"] for sample in samples],
                [sample["ac"] for sample in samples],
                [sample["rewards"] for sample in samples],
                epochs=10)

        # Alternate between collecting on-policy rollouts and refitting the
        # dynamics model on the new data.
        for i in range(NTRAIN_ITERS):
            print("####################################################################")
            print("Starting training iteration %d." % (i + 1))
            samples = []
            for j in range(NROLLOUTS_PER_ITER):
                samples.append(self.agent.sample(self.task_hor, self.policy))
            print("Rewards obtained:",
                  [sample["reward_sum"] for sample in samples])
            traj_obs.extend([sample["obs"] for sample in samples])
            traj_acs.extend([sample["ac"] for sample in samples])
            traj_rets.extend([sample["reward_sum"] for sample in samples])
            traj_rews.extend([sample["rewards"] for sample in samples])

            # Periodically checkpoint the model and log test success rates.
            if i % 50 == 0:
                self.model.save_models()
                test_results.append((i, self.test(20)))
                with open("test_graph.txt", "w") as test_file:
                    test_file.writelines(
                        [str(epoch) + "," + str(result) + "\n"
                         for (epoch, result) in test_results])

            self.policy.train(
                [sample["obs"] for sample in samples],
                [sample["ac"] for sample in samples],
                [sample["rewards"] for sample in samples])
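
# Sketch of the random policy interface that the training loop above relies
# on (RandomPolicy(2) matches the 2-D Pushing action space). The method name
# `act` and its (state, t) signature are assumptions about what Agent.sample
# calls; adapt them to the actual Agent implementation.
import numpy as np

class RandomPolicySketch:
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def reset(self):
        # Nothing to reset for a stateless random policy.
        pass

    def act(self, state, t):
        # Uniform random action in [-1, 1]^action_dim, ignoring the state.
        return np.random.uniform(-1.0, 1.0, size=self.action_dim)


# Entry-point sketch: train the MPC policy, then report the test success rate.
# exp = Experiment()
# exp.train()
# exp.test(num_episodes=50)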