Example #1
import os
import json

import gym
import numpy as np
import torch
from comet_ml import Experiment

# Actor_Critic, Actor_Critic_discrete and evaluate_policy are project-specific
# modules assumed to be importable alongside this script (not shown here).


def main(args):

    # create env
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # follow different logic depending on action space of env
    hidden_size = args.hidden_size

    if args.action_space == "continuous":
        # get env info
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = env.action_space.high
        min_action = env.action_space.low

        print("number of actions:{0}, dim of states: {1},\
          max_action: {2}, min_action: {3}"                                           .format(action_dim,\
                                                  state_dim,max_action,min_action))

        # create policy
        policy = Actor_Critic(state_dim, hidden_size, action_dim,
                              baseline=args.baseline)

    elif args.action_space == "discrete":
        # get env info
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        print("number of actions: {0}, dim of states: {1},\
        ".format(action_dim, state_dim))

        # create policy
        policy = Actor_Critic_discrete(state_dim, hidden_size, action_dim,
                                       baseline=args.baseline)

    else:
        raise NotImplementedError

    # setup comet_ml to track experiments
    if os.path.isfile("settings.json"):
        with open('settings.json') as f:
            data = json.load(f)
        args.comet_apikey = data["apikey"]
        args.comet_username = data["username"]
    else:
        raise NotImplementedError
    experiment = Experiment(api_key=args.comet_apikey,
                            project_name="simple_policy_gradient",
                            workspace=args.comet_username,
                            auto_output_logging="None",
                            auto_metric_logging=False,
                            auto_param_logging=False)
    experiment.set_name(args.namestr)
    args.experiment = experiment

    # start of experiment: Keep looping until desired amount of episodes reached
    max_episodes = args.num_episodes
    total_episodes = 0  # keep track of amount of episodes that we have done

    while total_episodes < max_episodes:

        obs = env.reset()
        done = False
        trajectory = []  # trajectory info for reinforce update
        episode_reward = 0  # keep track of rewards per episode

        while not done:
            action, ln_prob = policy.select_action(np.array(obs))
            next_state, reward, done, _ = env.step(action)
            trajectory.append(
                [np.array(obs), action, ln_prob, reward, next_state, done])

            obs = next_state
            episode_reward += reward

        total_episodes += 1

        # update actor/policy and critic/value_network
        policy_loss, value_loss = policy.train(trajectory)
        experiment.log_metric("value function loss",
                              value_loss,
                              step=total_episodes)

        experiment.log_metric("policy loss", policy_loss, step=total_episodes)
        experiment.log_metric("episode reward",
                              episode_reward,
                              step=total_episodes)

        # periodically evaluate the current policy
        if total_episodes % 10 == 0:
            evaluate_policy(policy, env)

    # close the environment once all episodes are done
    env.close()
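
The evaluate_policy call above comes from the project's own helpers and is not shown in this example. Below is a minimal sketch of what such a helper could look like, assuming the same select_action interface and the classic gym step API used in the training loop; the body is illustrative, not the original implementation.

import numpy as np

def evaluate_policy(policy, env, eval_episodes=5):
    # illustrative evaluation helper: run a few episodes and report the mean return
    returns = []
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            # same interface as the training loop: select_action returns (action, log_prob)
            action, _ = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        returns.append(total_reward)
    mean_return = np.mean(returns)
    print("evaluation over {0} episodes: mean return {1:.2f}".format(
        eval_episodes, mean_return))
    return mean_return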
Example #2
env = gym.envs.make(env_name)
MAX_ACTION = env.action_space.high
MIN_ACTION = env.action_space.low

ob_dim = env.observation_space.sample().shape[0]
ac_dim = env.action_space.sample().shape[0]

# MLP function approximators
pnet = MLP(2 * ac_dim, pnet_hparams)  # policy net: two outputs per action dim (e.g. mean and std of a Gaussian)
vnet = MLP(1, vnet_hparams)  # value net: single scalar state-value output

# actor and critic networks/training graphs in TF
actor = TF_CPolicy(pnet, ob_dim, ac_dim, hparams=actor_hparams,
                   min_val=MIN_ACTION, max_val=MAX_ACTION)
critic = TF_Value(vnet, ob_dim, hparams=critic_hparams)

# change structure of reward fn for car env
if env_name == "MountainCarContinuous-v0":
    def distance_reward(env, reward):
        return reward - np.abs(env.goal_position - env.state[0])
    reward_fn = distance_reward
else:
    reward_fn = None

# train and run actor critic
ac = Actor_Critic(env, actor, critic, hparams=ac_hparams,
                  reward_fn=reward_fn)
ac.train(video=False)
for _ in range(5):
    ac.do_episode(video=True)
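
The do_episode internals and the way Actor_Critic applies reward_fn are not shown in this example. The sketch below only illustrates where a shaping hook of the form reward_fn(env, reward), as in distance_reward above, would typically be applied inside an episode loop; random actions stand in for the actor purely for illustration.

def run_episode_with_shaping(env, reward_fn=None):
    # illustrative only: shows where an (env, reward) shaping hook fits in a step loop
    obs = env.reset()
    done = False
    total_shaped_reward = 0.0
    while not done:
        action = env.action_space.sample()  # placeholder for the actor's action
        obs, reward, done, _ = env.step(action)
        if reward_fn is not None:
            # e.g. distance_reward penalizes distance from env.goal_position
            reward = reward_fn(env, reward)
        total_shaped_reward += reward
    return total_shaped_reward

# usage sketch: run_episode_with_shaping(gym.make("MountainCarContinuous-v0"), distance_reward)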