env = DtRewardWrapper(env)

# Set seeds
seed(args.seed)

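# Environment dimensions and action bound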
state_dim = env.observation_space.shape
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")

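# Replay buffer for off-policy transition storage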
replay_buffer = utils.ReplayBuffer(args.replay_buffer_max_size)

# Evaluate untrained policy
evaluations = [evaluate_policy(env, policy)]

exp.metric("rewards", evaluations[0])

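# Training-loop bookkeeping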
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
episode_reward = None
env_counter = 0
while total_timesteps < args.max_timesteps:

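    # At an episode boundary, log the episode that just finished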
    if done:

        if total_timesteps != 0:
            print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
Example #2
# Initialize policy (the policy class is assumed to be TD3, matching the checkpoint names below)
policy = TD3(state_dim,
             action_dim,
             max_action,
             net_type=args.net_type,
             args=args)
if args.load_model:
    policy.load("TD3_2_best_start_0_best",
                "./pytorch_models/TD3_2_best_start_0")
    print("load suceed!")

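# Select uniform or prioritized experience replay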
if not args.priority_replay:
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
else:
    replay_buffer = PriReplayMemory(args, args.replay_buffer_max_size)

# Evaluate untrained policy
evaluation = np.mean([evaluate_policy(e, policy) for e in envs])
evaluations = [evaluation]

exp.metric("rewards", evaluations[0])

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
episode_reward = None
env_counter = 0
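# Track the best evaluation score seen so far, and start from a randomly chosen env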
best_eval_rew = -np.inf
best_eval_index = 0
env = random.choice(envs)
while total_timesteps < args.max_timesteps: