# Assumed imports for this snippet; `TD3` and `gen_envs` come from the surrounding project.
import os

import numpy as np
import torch


def test(arglist):
    env_name = arglist.env
    random_seed = arglist.test_seed
    n_episodes = arglist.n_episodes
    lr = 0.002
    max_timesteps = 3000
    render = arglist.render

    directory = "./train/{}".format(env_name)
    if not arglist.ensemble:
        filename = "{}_{}_{}".format(arglist.policy, env_name,
                                     arglist.train_seed)
    else:
        filename = "{}_{}_{}_ensemble".format(arglist.policy, env_name,
                                              arglist.train_seed)

    #env = gym.make(env_name)
    env = gen_envs(arglist)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Set random seed
    env.seed(random_seed)
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": 0.99,
        "tau": 0.005,
        "policy_noise": 0.001,
        "noise_clip": 1.0,
        "policy_freq": 2
    }
    policy = TD3.TD3(**kwargs)
    policy.load(os.path.join(directory, filename))

    total_reward_list = []
    for ep in range(1, n_episodes + 1):
        ep_reward = 0.0
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if done:
                break

        # print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        total_reward_list.append(ep_reward)
    env.close()
    return total_reward_list
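
A minimal usage sketch for `test`, assuming an argparse-style object carrying the attributes read above (the environment id and flag values are illustrative, not taken from the project's parser):

from argparse import Namespace

arglist = Namespace(policy="td3", env="Hopper-v2", train_seed=0, test_seed=1,
                    n_episodes=10, render=False, ensemble=False)
rewards = test(arglist)
print("mean reward over {} episodes: {:.2f}".format(
    len(rewards), sum(rewards) / len(rewards)))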
Example #2
def get_policy(arglist, kwargs, max_action):
    # Initialize policy
    if arglist.policy == "td3":
        # Target policy smoothing is disabled here (noise and clip set to zero)
        kwargs["policy_noise"] = 0.0
        kwargs["noise_clip"] = 0.0
        kwargs["policy_freq"] = 2
        policy = TD3.TD3(**kwargs)
    elif arglist.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif arglist.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif arglist.policy == 'adv':
        kwargs['alpha'] = 0.01
        kwargs['adv_epsilon'] = 0.01
        kwargs['logdir'] = (
            f'./tensorboard/{arglist.policy}_{arglist.env}_{arglist.train_seed}/')
        policy = TD3_adv2.TD3(**kwargs)
    else:
        raise NotImplementedError
    return policy
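
For reference, a hedged sketch of calling `get_policy` with illustrative dimensions and an argparse-style `arglist` (names and values here are assumptions, not the project's defaults):

from argparse import Namespace

kwargs = {"state_dim": 17, "action_dim": 6, "max_action": 1.0,
          "discount": 0.99, "tau": 0.005}
arglist = Namespace(policy="td3", env="Hopper-v2", train_seed=0)
policy = get_policy(arglist, kwargs, max_action=1.0)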
Example #3
tau = 0.005  # Target network update rate
policy_noise = 0.2  # Std of the Gaussian noise added to actions for exploration
noise_clip = 0.5  # Maximum magnitude of the Gaussian noise added to the (target) policy actions
policy_freq = 2  # Number of iterations between delayed policy (actor) updates
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0

episode_reward = 0
t0 = time.time()
distance_travelled = 0
max_episode_steps = 1000
done = True  # Episode over
load_model = True  # Inference; set to False to train from scratch

state_dim = 4
action_dim = 1
max_action = 5

replay_buffer = ReplayBuffer()
policy = TD3(state_dim, action_dim, max_action)

obs = np.array([])
new_obs = np.array([])
evaluations = []

if load_model:
    # `max_timesteps` and `file_name` are defined elsewhere in the original script
    total_timesteps = max_timesteps
    policy.load("%s" % (file_name), directory="./pytorch_models")

CarApp().run()
Example #4
    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise
        kwargs["noise_clip"] = args.noise_clip
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./checkpoint/{policy_file}")

    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = []
    # evaluations = [eval_policy(policy, env, args.seed, group_name)]

    # state, done = env.reset(group_name), False
    episode_reward = 0
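
The commented-out call above references an `eval_policy` helper; a rough sketch of what such a helper typically looks like in TD3-style code follows (the signature, the `group_name` argument, and the episode count are assumptions based on the commented lines):

def eval_policy(policy, eval_env, seed, group_name, eval_episodes=10):
    # Hypothetical sketch: average return of the deterministic policy over a few episodes.
    eval_env.seed(seed + 100)
    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(group_name), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes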
Example #5
    batch_size = 256  # Size of the training batch
    discount = 0.90  # Discount factor gamma for the total discounted reward
    polyak = 0.5  # Target network update rate
    policy_noise = 0.02  # Std of the Gaussian noise added to actions during training
    noise_clip = 0.5  # Maximum magnitude of the Gaussian noise added to the (target) policy actions
    policy_freq = 2  # Number of iterations between delayed policy (actor) updates
    actor_lr = 0.0001
    critic_lr = 0.0001

    # Loading the model
    policy = TD3(action_dim,
                 max_action,
                 batch_size=batch_size,
                 discount=discount,
                 polyak=polyak,
                 policy_noise=policy_noise,
                 noise_clip=noise_clip,
                 policy_freq=policy_freq,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 device=device)
    policy.load(file_name, './pytorch_models/')

    avg_reward = evaluate_policy(policy, env, eval_episodes=3)

    # Wrapup recording
    env.close()
    env.stats_recorder.save_complete()
    env.stats_recorder.done = True

    # # Recording in car view
Example #6
    updateRate = 0.005
    policyNoise = 0.2
    noiseClip = 0.5
    policyFreq = 2
    totalTimeSteps = 0
    timeStepsFromEval = 0
    episodeNo = 0
    episodeReward = 0
    maxEpisodeSteps = 1000
    done = True

    stateDim = 4
    actionDim = 1
    maxAction = 5
    replayBuffer = ReplayBuffer()
    brain = TD3(stateDim, actionDim, maxAction)

    observation = np.array([])
    newobservation = np.array([])
    evaluations = []

    # loading model (`testCar`, `maximumTimeSteps`, and `file_name` are defined elsewhere in the original script)
    if testCar:
        print("### Model loaded ###")
        totalTimeSteps = maximumTimeSteps
        brain.load(file_name, directory="./pytorch_models")

    parent = Environment()
    startTicks = pygame.time.get_ticks()

    while True:
Example #7
def train(env_name, warmup_iter=int(25e3), train_iter=int(1e6)):

    env = gym.make(env_name)
    x_dim = env.observation_space.shape[0]
    u_dim = env.action_space.shape[0]
    u_max = float(env.action_space.high[0])
    td3 = TD3(x_dim, u_dim, u_max)
    if torch.cuda.is_available():
        td3.actor.cuda()
        td3.critic_1.cuda()
        td3.critic_2.cuda()

    print('pre_train: ', eval(env_name, td3.actor))
    x = env.reset()
    for i in range(warmup_iter):
        u_random = env.action_space.sample()
        x_next, reward, done, info = env.step(u_random)

        td3.replay_buffer.append([x, u_random, x_next, reward, done])

        #td3.add_replay_buffer_sample(x, u_random, x_next, reward, done)
        #print(x.shape, x_next.shape, x_dim)
        #step = torch.tensor([x, u_random, x_next, reward, float(done)]).view(1, 5)

        #td3.replay_buffer = torch.cat((td3.replay_buffer, step), dim=1)
        #td3.replay_buffer.append([x, u_random, x_next, reward, done])
        x = x_next

        if done:
            x = env.reset()
            done = False

            #ep_reward = 0
            #ep_steps = 0
            #ep_num += 1

    for i in range(train_iter):
        if i % 10_000 == 0:
            print('i: ', i, eval(env_name, td3.actor))
        if i % 100_000 == 0:
            torch.save(td3.actor.state_dict(),
                       f'td3_hop_actor_{time.strftime(r"%m_%d_%H:%M")}.pt')
            torch.save(td3.critic_1.state_dict(),
                       f'td3_hop_critic_1_{time.strftime(r"%m_%d_%H:%M")}.pt')
            torch.save(td3.critic_2.state_dict(),
                       f'td3_hop_critic_2_{time.strftime(r"%m_%d_%H:%M")}.pt')

        # Build the state tensor on the actor's device (the networks may have been moved to CUDA above)
        device = next(td3.actor.parameters()).device
        u = td3.actor(torch.tensor(x, dtype=torch.float32, device=device))
        u_noise = torch.randn_like(u) * (0.1 * td3.u_max)  # exploration noise
        u = (u + u_noise).clamp(-td3.u_max, td3.u_max).detach().cpu().numpy()

        x_next, reward, done, info = env.step(u)
        td3.replay_buffer.append([x, u, x_next, reward, done])
        x = x_next

        batch = td3.sample_replay_buffer()
        critic_loss = td3.critic_loss(batch)

        td3.critic_optim.zero_grad()
        critic_loss.backward()
        td3.critic_optim.step()

        #print(critic_loss.item())
        # Delayed model updates
        if i % td3.policy_update_period == 0:

            actor_loss = td3.actor_loss(batch)

            td3.actor_optim.zero_grad()
            actor_loss.backward()
            td3.actor_optim.step()

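            # Polyak-averaged (soft) target updates: target <- tau * online + (1 - tau) * target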
            for target_param, cr_param in zip(td3.critic_1_target.parameters(),
                                              td3.critic_1.parameters()):
                target_param.data.copy_(td3.tau * cr_param.data +
                                        (1 - td3.tau) * target_param.data)

            for target_param, cr_param in zip(td3.critic_2_target.parameters(),
                                              td3.critic_2.parameters()):
                target_param.data.copy_(td3.tau * cr_param.data +
                                        (1 - td3.tau) * target_param.data)

            for target_param, actor_param in zip(td3.actor_target.parameters(),
                                                 td3.actor.parameters()):
                target_param.data.copy_(td3.tau * actor_param.data +
                                        (1 - td3.tau) * target_param.data)

        if done:
            x = env.reset()
            done = False

    print('train end: ', eval(env_name, td3.actor))
    return td3
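
A hedged example of kicking off training and saving the resulting actor (the environment id and iteration counts are placeholders, not the defaults above):

if __name__ == "__main__":
    td3 = train("Hopper-v2", warmup_iter=1000, train_iter=10000)
    torch.save(td3.actor.state_dict(), "td3_actor_final.pt")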