Example 1
def init_population(self):
    if not self.population:
        for worker_id in range(args.population_size):
            h = Hyperparameter(worker_id)
            policy = OurDDPG.DDPG(state_dim,
                                  action_dim,
                                  max_action,
                                  ACTOR_LR=h.ACTOR_LEARNING_RATE,
                                  CRITIC_LR=h.CRITIC_LEARNING_RATE)
            worker = Worker(worker_id=worker_id, h=h, agent=policy)
            self.population.append(worker)
    for worker in self.population:
        self.worker_queue.put(worker.worker_id)
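
Note that the snippet never defines the Hyperparameter and Worker containers it instantiates. A minimal sketch of what they would need to expose, with attribute names taken from the calls above and values that are purely illustrative:

import random

class Hyperparameter:
    def __init__(self, worker_id):
        self.worker_id = worker_id
        # per-worker learning rates, e.g. sampled log-uniformly (illustrative only)
        self.ACTOR_LEARNING_RATE = 10 ** random.uniform(-5, -3)
        self.CRITIC_LEARNING_RATE = 10 ** random.uniform(-5, -3)

class Worker:
    def __init__(self, worker_id, h, agent):
        self.worker_id = worker_id   # index into the population
        self.h = h                   # Hyperparameter instance
        self.agent = agent           # the OurDDPG.DDPG policy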
Example 2
File: show.py Project: llfl/TD3
def moduleShow(args):
    env = gym.make(args.env_name)
    state_dim = env.observation_space["observation"].shape[
        0] + env.observation_space["desired_goal"].shape[0]
    #state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]
    obs = env.reset()
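
Because state_dim is built from a dict observation space ("observation" plus "desired_goal"), the rollout code presumably flattens each observation before passing it to the policy. A small sketch of that step, assuming standard goal-based Gym observations:

import numpy as np

def flatten_obs(obs_dict):
    # concatenate the raw observation with the desired goal, matching state_dim above
    return np.concatenate([obs_dict["observation"], obs_dict["desired_goal"]])

# e.g. inside a rollout loop:
# action = policy.select_action(flatten_obs(obs))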
Example 3
    print "---------------------------------------"
    print "Settings: %s" % (file_name)
    print "---------------------------------------"

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = \
            TD3.TD3(state_dim, action_dim, max_action, actor_lr=args.actor_lr, is_ro=args.is_ro)
    elif args.policy_name == "OurDDPG":
        policy = \
            OurDDPG.DDPG(state_dim, action_dim, max_action, actor_lr=args.actor_lr, is_ro=args.is_ro)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    policy.load(
        "%s_%s_%s.pth" % (args.policy_name, args.env_name, str(args.seed)),
        "pytorch_models")

    evaluate_policy(policy, args.eval_episodes)
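
The evaluate_policy helper called here is not shown. A hedged sketch in the style of the reference TD3 runner, assuming it reuses the module-level env created by the surrounding script:

import numpy as np

def evaluate_policy(policy, eval_episodes=10):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs, done = env.reset(), False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward))
    return avg_reward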
Example 4
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
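
This block reads several attributes from args that are defined elsewhere in the script. For orientation, a sketch of the matching argparse flags with the defaults the reference TD3 training script typically uses (treat the exact values as assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--policy", default="TD3")            # TD3, OurDDPG or DDPG
parser.add_argument("--env", default="HalfCheetah-v2")
parser.add_argument("--seed", default=0, type=int)
parser.add_argument("--discount", default=0.99, type=float)
parser.add_argument("--tau", default=0.005, type=float)
parser.add_argument("--policy_noise", default=0.2, type=float)
parser.add_argument("--noise_clip", default=0.5, type=float)
parser.add_argument("--policy_freq", default=2, type=int)
parser.add_argument("--load_model", default="")           # "", "default", or a path
args = parser.parse_args()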
Example 5
File: main.py Project: yasasa/TD3
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "BNNTD3":
        policy = BNNTD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "BootstrapTD3":
        if args.actor_branches > 0:
            actor_branches = args.actor_branches
        else:
            actor_branches = args.branches
        policy = BootstrapTD3.TD3(state_dim, action_dim, max_action, args.branches, actor_branches)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    if args.actor_branches > 0:
        branches = args.actor_branches
    else:
        branches = args.branches
Example 6
def train(config, start_timesteps, max_timesteps, policy_noise, expl_noise,
          noise_clip, policy_freq, batch_size, seed, policy,
          prioritized_replay, env_name, eval_freq, discount, tau, use_rank):
    if prioritized_replay:
        alpha = float(config["alpha"])
        beta = float(config["beta"])
    else:
        discount = float(config["discount"])
        tau = float(config["tau"])

    import pybulletgym
    warnings.filterwarnings("ignore")
    env = gym.make(env_name)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": discount,
        "tau": tau,
    }

    # Initialize policy
    if policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = policy_noise * max_action
        kwargs["noise_clip"] = noise_clip * max_action
        kwargs["policy_freq"] = policy_freq
        kwargs["prioritized_replay"] = prioritized_replay
        kwargs["use_rank"] = use_rank
        policy = TD3.TD3(**kwargs)
    elif policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim,
                                                      action_dim,
                                                      max_timesteps,
                                                      start_timesteps,
                                                      alpha=alpha,
                                                      beta=beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_timesteps)):

        episode_timesteps += 1
        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * expl_noise, size=action_dim)).clip(
                    -max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            avg_reward = eval_policy(policy, env_name, seed)
            tune.report(episode_reward_mean=avg_reward)
            evaluations.append(avg_reward)
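
The utils.ReplayBuffer used above is constructed with state_dim and action_dim, filled via add(state, action, next_state, reward, done_bool), and sampled inside policy.train. A minimal buffer compatible with those calls, modelled on the one shipped with the reference TD3 code (a sketch, not necessarily the project's exact implementation):

import numpy as np
import torch

class ReplayBuffer:
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, next_state, reward, done):
        # store a single transition, overwriting the oldest entry once full
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = np.random.randint(0, self.size, size=batch_size)
        return (torch.FloatTensor(self.state[ind]).to(self.device),
                torch.FloatTensor(self.action[ind]).to(self.device),
                torch.FloatTensor(self.next_state[ind]).to(self.device),
                torch.FloatTensor(self.reward[ind]).to(self.device),
                torch.FloatTensor(self.not_done[ind]).to(self.device))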
Example 7
		os.makedirs("./pytorch_models")

	env = gym.make(args.env_name)

	# Set seeds
	env.seed(args.seed)
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	
	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0] 
	max_action = float(env.action_space.high[0])

	# Initialize policy
	if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action)
	elif args.policy_name == "OurDDPG": policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
	elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action)

	replay_buffer = utils.ReplayBuffer()
	
	# Evaluate untrained policy
	evaluations = [evaluate_policy(policy)] 

	total_timesteps = 0
	timesteps_since_eval = 0
	episode_num = 0
	done = True 

	while total_timesteps < args.max_timesteps:
		
		if done: 
Example 8
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {"state_dim": state_dim, "action_dim": action_dim, "max_action": max_action,
              "discount": variant['discount'], "tau": variant['tau'],
              'network_class': NETWORK_CLASSES[variant['network_class']]}

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                  hidden_dim=variant['hidden_dim'],
                                  fourier_dim=variant['fourier_dim'],
                                  sigma=variant['sigma'],
                                  concatenate_fourier=variant['concatenate_fourier'],
                                  train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])
    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise * max_action']
        kwargs["noise_clip"] = variant['noise_clip * max_action']
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [float(env.action_space.low.min()), float(env.action_space.high.max())]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        raise RuntimeError

    # load replay buffer
    replay_buffer = torch.load(os.path.join(variant['replay_buffer_folder'], 'generated_replay_buffer.pt'))

    policy_optimizer = torch.optim.Adam(policy.actor.parameters(), lr=variant['lr'])
    qf_optimizer = torch.optim.Adam(policy.critic.Q1.parameters(), lr=variant['lr'])

    # split into train and val for both action and q_value
    indices = np.arange(replay_buffer.max_size)
    random.shuffle(indices)
    train_indices = indices[:int(0.9 * len(indices))]
    val_indices = indices[int(0.9 * len(indices)):]
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(replay_buffer.state[train_indices]).float(),
                                                   torch.tensor(replay_buffer.action[train_indices]).float(),
                                                   torch.tensor(replay_buffer.correct_action[train_indices]).float(),
                                                   torch.tensor(replay_buffer.q_value[train_indices]).float())
    val_dataset = torch.utils.data.TensorDataset(torch.tensor(replay_buffer.state[val_indices]).float(),
                                                 torch.tensor(replay_buffer.action[val_indices]).float(),
                                                 torch.tensor(replay_buffer.correct_action[val_indices]).float(),
                                                 torch.tensor(replay_buffer.q_value[val_indices]).float())

    # train a network on it
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=variant['batch_size'], shuffle=True,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=variant['batch_size'], shuffle=True,
                                             pin_memory=True)

    train_q_losses = []
    train_policy_losses = []
    val_q_losses = []
    val_policy_losses = []
    for _ in trange(variant['n_train_epochs']):
        total_q_loss = 0
        total_policy_loss = 0
        for (state, action, correct_action, q) in train_loader:
            state = state.to(DEVICE)
            action = action.to(DEVICE)
            correct_action = correct_action.to(DEVICE)
            q = q.to(DEVICE)
            q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
            policy_preds = policy.actor(state).mean
            q_loss = F.mse_loss(q_preds, q)
            policy_loss = F.mse_loss(policy_preds, correct_action)
            qf_optimizer.zero_grad()
            policy_optimizer.zero_grad()
            q_loss.backward()
            policy_loss.backward()
            qf_optimizer.step()
            policy_optimizer.step()
            total_q_loss += q_loss.item()
            total_policy_loss += policy_loss.item()

        # get validation stats
        total_val_q_loss = 0
        total_val_policy_loss = 0
        with torch.no_grad():
            for (state, action, correct_action, q) in val_loader:
                state = state.to(DEVICE)
                action = action.to(DEVICE)
                correct_action = correct_action.to(DEVICE)
                q = q.to(DEVICE)
                q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
                policy_preds = policy.actor(state).mean
                q_loss = F.mse_loss(q_preds, q)
                policy_loss = F.mse_loss(policy_preds, correct_action)
                total_val_q_loss += q_loss.item()
                total_val_policy_loss += policy_loss.item()

        train_q_losses.append(total_q_loss / len(train_loader))
        train_policy_losses.append(total_policy_loss / len(train_loader))
        val_q_losses.append(total_val_q_loss / len(val_loader))
        val_policy_losses.append(total_val_policy_loss / len(val_loader))
        print(f'train: qf loss: {train_q_losses[-1]:.4f}, policy loss: {train_policy_losses[-1]:.4f}')
        print(f'val: qf loss: {val_q_losses[-1]:.4f}, policy loss: {val_policy_losses[-1]:.4f}')

    # evaluate the resulting policy for 100 episodes
    eval_return = eval_policy(policy, variant['env'], variant['seed'], eval_episodes=variant['eval_episodes'])

    # save the results
    to_save = dict(
        train_q_losses=train_q_losses,
        train_policy_losses=train_policy_losses,
        val_q_losses=val_q_losses,
        val_policy_losses=val_policy_losses,
        eval_return=eval_return,
        qf=policy.critic.Q1.state_dict(),
        policy=policy.actor.state_dict()
    )
    torch.save(to_save, os.path.join(variant['replay_buffer_folder'], f'{variant["network_class"]}_distillation.pt'))
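
DEVICE and to_np are used throughout this snippet but never defined in it; they are most likely small helpers along these lines (an assumption):

import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def to_np(t):
    # move a tensor off the graph and back to a numpy array
    return t.detach().cpu().numpy()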
Example 9
def train(config, args):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    import pybulletgym
    warnings.filterwarnings("ignore")
    eps_bounds = args.reacher_epsilon_bounds      # just aliasing with shorter variable name
    utils_object = utils.GeneralUtils(args)

    if args.tune_run:
        if args.prioritized_replay:
            args.alpha = float(config["alpha"])
            args.beta = float(config["beta"])
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
        elif args.custom_env and args.use_hindsight:
            eps_bounds = [float(config["epsilons"][0]), float(config["epsilons"][1])]
            args.seed = int(config["seed"])
        else:
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
    
    if args.custom_env:
        gym.envs.register(
            id='OurReacher-v0',
            entry_point='our_reacher_env:OurReacherEnv',
            max_episode_steps=50,
            reward_threshold=100.0,
        )

        # this is assuming we only use epsilon for custom env or fetch reach, where episode tsteps is 50 !!!!
        max_episode_steps = 50

        # retrieve epsilon range
        [a, b] = eps_bounds
        epsilons = utils_object.epsilon_calc(a, b, max_episode_steps)
        env = gym.make('OurReacher-v0', epsilon=epsilons[0], render=False)
    else:
        env = gym.make(args.env)

    if utils_object.fetch_reach and utils_object.args.fetch_reach_dense:
        env.env.reward_type = "dense"

    # Set seeds
    env.seed(int(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if utils_object.fetch_reach:
        state_dim = env.reset()["observation"].shape[0]
    else:
        state_dim = env.observation_space.shape[0]
    if args.use_hindsight:          # include both current state and goal state
        if args.custom_env:
            state_dim += 2          # reacher nonsense; goal = (x, y)
        elif utils_object.fetch_reach:
            state_dim += 3          # include fetchreach goal state (x,y,z position)
        else:
            state_dim *= 2

    action_dim = env.action_space.shape[0] 
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["prioritized_replay"] = args.prioritized_replay
        kwargs["use_rank"] = args.use_rank
        kwargs["use_hindsight"] = args.use_hindsight
        
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    exp_descriptors = [
        args.policy, 'CustomReacher' if args.custom_env else args.env,
        f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '', 
        'HER' if args.use_hindsight else '',
        f"{args.decay_type}decay-eps{f'{eps_bounds[0]}-{eps_bounds[1]}' if eps_bounds[0] != eps_bounds[1] else f'{eps_bounds[0]}'}" if args.custom_env else "",
        f"k{args.k}",
        datetime.now().strftime('%Y%m%d%H%M')
    ]
    if args.tune_run:
        # fudgy: assumes tune_run for non-HER experiments
        exp_descriptors = [
            args.policy, 'CustomReacher' if args.custom_env else args.env,
            f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '', 
            f"tau{args.tau}", f"discount{args.discount}",
            f"alpha{args.alpha}" if args.prioritized_replay else '',
            f"beta{args.beta}" if args.prioritized_replay else '',
            f"k{args.k}",
            datetime.now().strftime('%Y%m%d%H%M')
        ]

    exp_descriptors = [x for x in exp_descriptors if len(x) > 0]
    file_name = "_".join(exp_descriptors)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    if args.prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      args.max_timesteps, args.start_timesteps,
                                                      alpha=args.alpha, beta=args.beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)
    
    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed, utils_object=utils_object)]
 
    state, done = env.reset(), False

    original_episode_reward = 0
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    trajectory = []

    for t in range(int(args.max_timesteps)):
        
        episode_timesteps += 1
        x, goal = utils_object.compute_x_goal(state, env)
        
        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(x))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        if args.use_hindsight:
            if utils_object.fetch_reach:
                goal = state["desired_goal"]
                next_x = np.concatenate([np.array(next_state["observation"]), goal])
            else:
                # env.set_goal(goal)
                next_x = np.concatenate([np.array(next_state), goal])
        elif utils_object.fetch_reach:
            next_x = np.array(next_state["observation"])
        else:
            next_x = next_state

        # Store data in replay buffer
        if not args.use_hindsight:
            replay_buffer.add(x, action, next_x, reward, done_bool)

        trajectory.append((state, action, next_state, reward, done_bool))

        state = next_state
        episode_reward += reward
        if args.custom_env:
            original_episode_reward += env.original_rewards

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            if args.use_hindsight:
                replay_buffer.add_hindsight(trajectory, goal, env, k=args.k, fetch_reach=utils_object.fetch_reach)
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Original Reward: {original_episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            original_episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            if args.custom_env:
                epsilon = epsilons[episode_num]
                env.set_epsilon(epsilon)

            trajectory = []

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaled_policy = eval_policy(policy, args.env, args.seed, utils_object=utils_object)
            evaluations.append(evaled_policy)
            np.save(f"./results/{file_name}", evaluations)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            if args.plot:
                plotter.plot(file_name, args.custom_env)
            if args.tune_run:
                tune.report(episode_reward_mean=evaled_policy[0])
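
The epsilon schedule comes from utils_object.epsilon_calc, which is not shown. Since the loop indexes epsilons by episode number, it plausibly precomputes one value per episode; a rough sketch under that assumption (the real helper also honors args.decay_type, which is ignored here):

import numpy as np

def epsilon_calc(a, b, max_episode_steps, max_timesteps=int(1e6)):
    # one epsilon per episode, decayed linearly from a to b (assumed schedule)
    n_episodes = max_timesteps // max_episode_steps + 1
    return np.linspace(a, b, n_episodes)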
Example 10
def load_policy(load_from):
    # Initialize policy
    start_step = 0
    if args.policy == "TD3":
        import TD3
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * kwargs['max_action']
        kwargs["noise_clip"] = args.noise_clip * kwargs['max_action']
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        import OurDDPG
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        import DDPG
        policy = DDPG.DDPG(**kwargs)

    # create experiment directory (may not be used)
    exp_cnt = 0
    load_model_path = ''
    results_dir = os.path.join(args.savedir, args.exp_name+'%02d'%exp_cnt)
    while os.path.exists(results_dir):
        exp_cnt+=1
        results_dir = os.path.join(args.savedir, args.exp_name+'%02d'%exp_cnt)

    # load model if necessary
    if load_from != "":
        if os.path.isdir(load_from):
            print("loading latest model from dir: {}".format(load_from))
            # find last file
            search_path = os.path.join(load_from, '*.pt')
            model_files = glob(search_path)
            if not len(model_files):
                print('could not find model exp files at {}'.format(search_path))
                raise FileNotFoundError(search_path)
            else:
                load_model_path = sorted(model_files)[-1]
        else:
            load_model_path = load_from
            print("loading model from file: {}".format(load_model_path))
        policy.load(load_model_path)
        # TODO 
        # utils.load_info_dict(load_model_base)
        try:
            start_step = int(load_model_path[-13:-3])
        except Exception:
            try:
                start_step = policy.step
            except Exception:
                print('unable to get start step from name - set it manually')

        # store in old dir
        if not args.continue_in_new_dir:
            results_dir = os.path.split(load_model_path)[0]
            print("continuing in loaded directory")
            print(results_dir)
        else:
            print("resuming in new directory")
            print(results_dir)
    else:
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
            print('storing results in: {}'.format(results_dir))
    return policy, start_step, results_dir, load_model_path
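
A hedged usage sketch for load_policy, assuming the surrounding script has already populated the global args and kwargs it relies on:

# resume from the most recent checkpoint in a directory (or pass "" to start fresh)
policy, start_step, results_dir, load_model_path = load_policy("./pytorch_models")
print("resuming at step %d, writing results to %s" % (start_step, results_dir))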
Example 11
    transfer_state_dim = 0
    transfer_action_dim = 0
    if args.transfer_env is not None:
        transfer_model = args.policy_name + "_" + args.transfer_env + "_" + str(
            args.seed)

        env_t = gym.make(args.transfer_env)
        transfer_state_dim = env_t.observation_space.shape[0]
        transfer_action_dim = env_t.action_space.shape[0]

    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action, transfer_model,
                         transfer_state_dim, transfer_action_dim)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action,
                              transfer_model, transfer_state_dim,
                              transfer_action_dim)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    t0 = time.time()

    while total_timesteps < args.max_timesteps:
Example 12
    eval_envs = 100 + args.eval_episodes
    eval_env = make_vec_envs(args.env_name, eval_envs)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if args.policy_name == 'TD3':
        args.swap_criterion = None

    # Initialize policy
    if args.policy_name == "TD3" or args.policy_name == 'TD3-swap':
        policy = TD3.TD3(state_dim, action_dim, 1, args.target_q, args.target_distance_weight)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, 1)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, 1)
    else:
        raise NotImplementedError

    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    total_timesteps_with_eval = 0
    timesteps_since_eval = 0
    timesteps_since_swapped = 0
    episode_num = 0
    done = True
    episode_reward = 0
    episode_timesteps = 0
Example 13
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": variant['discount'],
        "tau": variant['tau'],
        'network_class': NETWORK_CLASSES[variant['network_class']]
    }

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(
        n_hidden=variant['n_hidden'],
        hidden_dim=variant['hidden_dim'],
        fourier_dim=variant['fourier_dim'],
        sigma=variant['sigma'],
        concatenate_fourier=variant['concatenate_fourier'],
        train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])
    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise * max_action']
        kwargs["noise_clip"] = variant['noise_clip * max_action']
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant[
            'automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [
            float(env.action_space.low.min()),
            float(env.action_space.high.max())
        ]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        policy_file = variant['load_model']
        # policy_file = file_name if variant['load_model'] == "default" else variant['load_model']
        policy.load(policy_file)

    replay_buffer = CustomReplayBuffer(state_dim,
                                       action_dim,
                                       max_size=int(variant['max_timesteps']))

    # fill replay buffer, save immediately
    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    curr_time = datetime.now()
    for t in trange(int(variant['max_timesteps'])):
        episode_timesteps += 1
        action = policy.select_action(np.array(state), evaluate=False)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action)
        state = next_state
        episode_reward += reward

        if done or episode_timesteps > env._max_episode_steps:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

    # save the replay buffer
    folder = os.path.dirname(policy_file)
    torch.save(replay_buffer, os.path.join(folder,
                                           'generated_replay_buffer.pt'))
    assert replay_buffer.max_size == replay_buffer.size

    # label the items in the replay buffer with my q-networks and policy
    with torch.no_grad():
        for start_idx in trange(0, replay_buffer.max_size,
                                variant['batch_size']):
            end_idx = start_idx + variant['batch_size']
            obs = torch.tensor(replay_buffer.state[start_idx:end_idx],
                               device=DEVICE,
                               dtype=torch.float32)
            action = torch.tensor(replay_buffer.action[start_idx:end_idx],
                                  device=DEVICE,
                                  dtype=torch.float32)
            actor_Q1, actor_Q2 = policy.critic(obs, action)
            actor_Q = torch.min(actor_Q1, actor_Q2)
            action = policy.actor(obs).mean.clamp(*policy.action_range)
            replay_buffer.set_values(start_idx, end_idx, to_np(actor_Q),
                                     to_np(action))

    # overwrite the bad replay buffer
    torch.save(replay_buffer, os.path.join(folder,
                                           'generated_replay_buffer.pt'))
Example 14
def main(env_name, seed, algo, idx):
    # algo: TD3, DDPG, OurDDPG
    # seed: int
    # env_name: str

    class args:
        policy_name = "algo"
        env_name = "env_name"
        seed = 0
        start_timesteps = int(1e4)
        eval_freq = int(5e3)
        max_timesteps = int(1e6)
        save_models = True
        expl_noise = 0.1
        batch_size = 100
        discount = 0.99
        tau = 0.005
        policy_noise = 0.2
        noise_clip = 0.5
        policy_freq = 2

    args.policy_name = algo
    args.env_name = env_name
    args.seed = seed

    file_name = "%s-%s-seed-%s--reward.csv" % (args.policy_name, args.env_name,
                                               str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    while total_timesteps < args.max_timesteps:

        if done:

            if total_timesteps != 0:
                print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" %
                      (total_timesteps, episode_num, episode_timesteps,
                       episode_reward))
                if args.policy_name == "TD3":
                    policy.train(replay_buffer, episode_timesteps,
                                 args.batch_size, args.discount, args.tau,
                                 args.policy_noise, args.noise_clip,
                                 args.policy_freq)
                else:
                    policy.train(replay_buffer, episode_timesteps,
                                 args.batch_size, args.discount, args.tau)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy))

                if args.save_models:
                    policy.save(file_name, directory="./pytorch_models")
                np.save("./results/%s" % (file_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy))
    if args.save_models:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)
    return True
Exemplo n.º 15
0
def experiment(variant):
    from rlkit_logging import logger
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": variant['discount'],
        "tau": variant['tau'],
        'network_class': NETWORK_CLASSES[variant['network_class']]
    }

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(
        n_hidden=variant['n_hidden'],
        hidden_dim=variant['hidden_dim'],
        fourier_dim=variant['fourier_dim'],
        sigma=variant['sigma'],
        concatenate_fourier=variant['concatenate_fourier'],
        train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])
    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise * max_action']
        kwargs["noise_clip"] = variant['noise_clip * max_action']
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant[
            'automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [
            float(env.action_space.low.min()),
            float(env.action_space.high.max())
        ]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        policy_file = file_name if variant[
            'load_model'] == "default" else variant['load_model']
        policy.load(f"./models/{policy_file}")

    # change the kwargs for logging and plotting purposes
    kwargs['network_kwargs'] = {
        **mlp_network_kwargs,
        **dropout_mlp_network_kwargs,
        **fourier_network_kwargs,
        **siren_network_kwargs
    }
    kwargs['expID'] = variant['expID']
    kwargs['seed'] = variant['seed']
    kwargs['first_dim'] = max(variant['hidden_dim'], variant['first_dim'])
    kwargs['env'] = variant['env']

    # set up logging
    # log_dir = create_env_folder(args.env, args.expID, args.policy, args.network_class, test=args.test)
    # save_kwargs(kwargs, log_dir)
    # tabular_log_path = osp.join(log_dir, 'progress.csv')
    # text_log_path = osp.join(log_dir, 'debug.log')

    # logger.add_text_output(text_log_path)
    # logger.add_tabular_output(tabular_log_path)
    # exp_name = f'{args.env}-td3-exp{args.expID}'
    # logger.push_prefix("[%s] " % exp_name)
    policy.save(osp.join(logger.get_snapshot_dir(), f'itr0'))

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, variant['env'], variant['seed'])]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    curr_time = datetime.now()

    for t in range(int(variant['max_timesteps'])):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < variant['start_timesteps']:
            action = env.action_space.sample()
        elif variant['policy'] in {'TD3', 'DDPG', 'OurDDPG'}:
            action = (policy.select_action(np.array(state), evaluate=False) +
                      np.random.normal(0,
                                       max_action * variant['expl_noise'],
                                       size=action_dim)).clip(
                                           -max_action, max_action)
        elif variant['policy'] in {
                'SAC', 'PytorchSAC', 'RandomNoisePytorchSAC',
                'SmoothedPytorchSAC', 'FuncRegPytorchSAC'
        }:
            action = policy.select_action(np.array(state), evaluate=False)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= variant['start_timesteps']:
            policy.train_mode(training=True)
            policy.train(replay_buffer, variant['batch_size'])
            policy.train_mode(training=False)

        if done or episode_timesteps > env._max_episode_steps:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % variant['eval_freq'] == 0:
            evaluations.append(
                eval_policy(policy, variant['env'], variant['seed']))
            new_time = datetime.now()
            time_elapsed = (new_time - curr_time).total_seconds()
            curr_time = new_time

            logger.record_tabular('Timestep', t)
            logger.record_tabular('Eval returns', evaluations[-1])
            logger.record_tabular('Time since last eval (s)', time_elapsed)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
            if (t + 1) % 250000 == 0:
                policy.save(osp.join(logger.get_snapshot_dir(), f'itr{t + 1}'))
    policy.save(osp.join(
        logger.get_snapshot_dir(),
        f'final'))  # might be unnecessary if everything divides properly
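
The policy.train_mode toggle used in the training loop is not defined in this snippet. A plausible reading, assuming the policy wraps torch nn.Module members named actor and critic (both names are assumptions):

class PolicyWithTrainMode:            # illustrative wrapper only
    def train_mode(self, training=True):
        # nn.Module.train(mode) switches dropout/batch-norm behaviour
        self.actor.train(training)
        self.critic.train(training)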
Example 16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy_name", default="TD3")  # Policy name
    parser.add_argument("--env_name", default="Reacher-v2")
    parser.add_argument("--seed", default=0,
                        type=int)  # Sets Gym, PyTorch and Numpy seeds
    parser.add_argument(
        "--start_timesteps", default=1e3,
        type=int)  # How many time steps purely random policy is run for
    parser.add_argument("--eval_freq", default=5e3,
                        type=float)  # How often (time steps) we evaluate
    parser.add_argument("--max_timesteps", default=1e6,
                        type=float)  # Max time steps to run environment for
    parser.add_argument("--save_models",
                        action="store_true")  # Whether or not models are saved
    parser.add_argument("--expl_noise", default=0.1,
                        type=float)  # Std of Gaussian exploration noise
    parser.add_argument("--batch_size", default=100,
                        type=int)  # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99,
                        type=float)  # Discount factor
    parser.add_argument("--tau", default=0.005,
                        type=float)  # Target network update rate
    parser.add_argument(
        "--policy_noise", default=0.2,
        type=float)  # Noise added to target policy during critic update
    parser.add_argument("--noise_clip", default=0.5,
                        type=float)  # Range to clip target policy noise
    parser.add_argument("--policy_freq", default=2,
                        type=int)  # Frequency of delayed policy updates
    args = parser.parse_args()

    file_name = "%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    for dirname in ("./results", "./rewards"):
        os.makedirs(dirname, exist_ok=True)
    if args.save_models:
        os.makedirs("./pytorch_models", exist_ok=True)

    unity = UnityEnvironment(file_name=executable(), no_graphics=True)
    env = UnityWrapper(unity, train_mode=True)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space
    action_dim = env.action_space
    max_action = 1

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "mDDPG":
        policy = mDDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy).mean()]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    episode_timesteps = 0
    done = True
    rewards = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print(
                    ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                    (total_timesteps, episode_num, episode_timesteps,
                     episode_reward))
                rewards.append(episode_reward)
                if args.policy_name == "TD3":
                    policy.train(
                        replay_buffer,
                        episode_timesteps,
                        args.batch_size,
                        args.discount,
                        args.tau,
                        args.policy_noise,
                        args.noise_clip,
                        args.policy_freq,
                    )
                else:
                    policy.train(replay_buffer, episode_timesteps,
                                 args.batch_size, args.discount, args.tau)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy).mean())

                if args.save_models:
                    policy.save(file_name, directory="./pytorch_models")
                np.save("./results/%s" % (file_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space)).clip(
                        env.action_space_low, env.action_space_high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy, 100).mean())
    if args.save_models:
        policy.save("%s" % (file_name), directory="./pytorch_models")
    np.save("./results/%s" % (file_name), evaluations)
    np.save("./rewards/%s" % (file_name), rewards)