示例#1
0
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 env_id,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 layer_1_size=256,
                 layer_2_size=256,
                 batch_size=100,
                 reward_scale=2):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        self.batch_size = batch_size
        self.n_actions = n_actions
        self.scale = reward_scale
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer_1_size,
                                  layer_2_size,
                                  n_actions=n_actions,
                                  name=env_id + '_actor',
                                  max_action=env.action_space.high)

        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      layer_1_size,
                                      layer_2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_1')

        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      layer_1_size,
                                      layer_2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_2')

        self.value = ValueNetwork(beta,
                                  input_dims,
                                  layer_1_size,
                                  layer_2_size,
                                  name=env_id + '_value')

        self.target_value = ValueNetwork(beta,
                                         input_dims,
                                         layer_1_size,
                                         layer_2_size,
                                         name=env_id + '_target_value')

        self.update_network_parameters(tau=1)
示例#2
0
文件: agent.py 项目: mfsuve/TORCS-RL
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')
示例#3
0
  def __init__(self, config):
    self.config = config

    # Create session
    self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True)))

    # Create networks
    self.prior_network = PolicyNetwork(
        scope=config.prior_network,
        temperature=config.prior_temperature,
        use_symmetry=config.use_symmetry)

    self.rollout_network = PolicyNetwork(
        scope=config.rollout_network,
        temperature=config.rollout_temperature,
        reuse=config.prior_network == config.rollout_network,
        use_symmetry=config.use_symmetry)

    self.value_network = ValueNetwork(
        scope=config.value_network, use_symmetry=config.use_symmetry)

    # Load networks from checkpoints
    run_dir = util.run_directory(config)
    util.restore_network_or_fail(self.session, run_dir, self.prior_network)
    util.restore_network_or_fail(self.session, run_dir, self.rollout_network)
    util.restore_network_or_fail(self.session, run_dir, self.value_network)

    # Create queues
    self.prior_queue = AllQueue()
    self.rollout_queue = AllQueue(maxsize=16)
    self.value_queue = AllQueue(maxsize=16)

    self.new_game()
示例#4
0
def train():
    tf.reset_default_graph()
    # Create a global step variable
    global_step = tf.Variable(0, name="global_step", trainable=False)

    policy_estimator = PolicyNetwork(input_size=len(env.state), output_size=env.action_space.n,
                                     summaries_dir=experiment_dir)
    value_estimator = ValueNetwork(input_size=len(env.state), output_size=1)
    # Object-aware Reward Network
    reward_estimator = ObjectAwareRewardNetwork(input_size=len(env.state), output_size=1, action_num=env.action_space.n)
    # # Reward Network
    # reward_estimator = RewardNetwork(input_size=len(env.state), output_size=1, action_num=env.action_space.n)

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.initialize_all_variables())
        reinforce(env,
                  policy_estimator,
                  value_estimator,
                  reward_estimator,
                  max_num_episodes,
                  sess,
                  discount_factor=0.99,
                  uniform_sample=False,
                  saver=saver,
                  model_dir=model_dir,
                  figure_dir=figure_dir)
示例#5
0
    def __init__(self, alpha = 0.0003, beta = 0.0003, input_dims = [8],
                 env = None, gamma = 0.99, tau = 0.005, n_actions = 2, max_size = 1000000,
                 layer1_size = 256, layer2_size = 256, batch_size = 256, reward_scale = 2):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.scale = reward_scale

        self.memory = ReplayBuffer(max_size, input_dims, n_actions = n_actions)
        self.actor = ActorNetwork(alpha, input_dims, n_actions = n_actions, max_action = env.action_space.high)
        self.critic1 = CriticNetwork(beta, input_dims, n_actions = n_actions, name = 'critic1')
        self.critic2 = CriticNetwork(beta, input_dims, n_actions = n_actions, name = 'critic2')
        self.value = ValueNetwork(beta, input_dims, name = 'value')
        self.target_value = ValueNetwork(beta, input_dims, name = 'target')
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.update_network_params(tau = 1)
示例#6
0
    def __init__(self,
                 alpha=0.0003,
                 beta=.0003,
                 input_dims=[8],
                 env=None,
                 gamma=.99,
                 n_actions=2,
                 max_size=1000000,
                 layer1_size=256,
                 layer2_size=256,
                 tau=.005,
                 batch_size=256,
                 reward_scale=2):
        # reward scales  depends on action convention for the environment
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        # set up classes
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  max_action=env.action_space.high,
                                  n_actions=n_actions,
                                  name='actor')

        self.critic1 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions=n_actions,
                                     name='critic_1')
        self.critic2 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions=n_actions,
                                     name='critic_2')

        self.value = ValueNetwork(beta, input_dims, name='value')
        # target value
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')
        self.scale = reward_scale
        self.update_network_parameters(tau=1)
示例#7
0
    def __init__(self, config):
        self.config = config
        self.run_dir = util.run_directory(config)
        self.position_targets = PositionTargets(config, self.run_dir)

        self.session = tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.value_network = ValueNetwork('value')
        util.restore_or_initialize_network(self.session, self.run_dir,
                                           self.value_network)

        # Train ops
        self.create_train_op(self.value_network)
        self.writer = tf.summary.FileWriter(self.run_dir)
        util.restore_or_initialize_scope(self.session, self.run_dir,
                                         self.training_scope.name)
示例#8
0
class Agent():
    def __init__(self, alpha = 0.0003, beta = 0.0003, input_dims = [8],
                 env = None, gamma = 0.99, tau = 0.005, n_actions = 2, max_size = 1000000,
                 layer1_size = 256, layer2_size = 256, batch_size = 256, reward_scale = 2):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.scale = reward_scale

        self.memory = ReplayBuffer(max_size, input_dims, n_actions = n_actions)
        self.actor = ActorNetwork(alpha, input_dims, n_actions = n_actions, max_action = env.action_space.high)
        self.critic1 = CriticNetwork(beta, input_dims, n_actions = n_actions, name = 'critic1')
        self.critic2 = CriticNetwork(beta, input_dims, n_actions = n_actions, name = 'critic2')
        self.value = ValueNetwork(beta, input_dims, name = 'value')
        self.target_value = ValueNetwork(beta, input_dims, name = 'target')
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.update_network_params(tau = 1)

    def choose_action(self, obs):
        state = torch.tensor([obs],dtype=torch.float32).to(self.device)
        actions, _ = self.actor.sample_normal(state, reparam = False)

        return actions.cpu().detach().numpy()[0]

    def store_trans(self, state, action, reward, new_state, done):
        self.memory.store_trans(state, action, reward, new_state, done)

    def update_network_params(self, tau = None):
        if tau is None:
            tau = self.tau
        
        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict.keys():
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()
            
        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic1.save_checkpoint()
        self.critic2.save_checkpoint()
        print('saving models')
    def load_models(self):
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic1.load_checkpoint()
        self.critic2.load_checkpoint()
        print('loading models')

    def get_critic_val_log_prob(self, state, reparam):
        actions, log_probs = self.actor.sample_normal(state, reparam = False)
        log_probs = log_probs.view(-1)
        q1_new = self.critic1(state, actions)
        q2_new = self.critic2(state, actions)
        critic_value = torch.min(q1_new, q2_new)
        critic_value = critic_value.view(-1)

        return log_probs, critic_value

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return
        
        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        reward = torch.tensor(reward, dtype=torch.float).to(self.actor.device)
        done = torch.tensor(done).to(self.actor.device)
        state_ = torch.tensor(new_state, dtype=torch.float).to(self.actor.device)
        state = torch.tensor(state, dtype=torch.float).to(self.actor.device)
        action = torch.tensor(action, dtype=torch.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state, reparam=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparam=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)
        
        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic1.optimizer.zero_grad()
        self.critic2.optimizer.zero_grad()
        q_hat = self.scale*reward + self.gamma*value_
        q1_old_policy = self.critic1.forward(state, action).view(-1)
        q2_old_policy = self.critic2.forward(state, action).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic1.optimizer.step()
        self.critic2.optimizer.step()

        self.update_network_params()
示例#9
0
def main():


    
    # ENVIROMENT
    env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 200
    batch_size = 32
    max_iterations = 200
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
    
    # INIT
    history = History()
    observation = env.reset()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD 
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']
    

    for ite in tqdm(range(max_iterations), ascii=True):

        if ite % 5 == 0:
            torch.save({
                'policy_model': policy_model.state_dict(),
                'policy_optimizer': policy_optimizer.state_dict(),
                'value_model': value_model.state_dict(),
                'value_optimizer': value_optimizer.state_dict(),
                'running_reward': running_reward}, model_path)


        
        episode_ite, running_reward = collect(episode_ite, running_reward, env,
                                            max_episodes, max_timesteps, state_scale,
                                            reward_scale, writer, history, policy_model,
                                            value_model, gamma, gae_lambda, device)
        
        # Here we have collected N trajectories.
        history.build_dataset()

        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True, drop_last=True)


        policy_loss, value_loss, train_ite = train_network(data_loader, policy_model, value_model,
                                                        policy_optimizer, value_optimizer ,n_epoch, clip,
                                                        train_ite, writer, entropy_coefficient)


        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        writer.add_scalar("Running Reward", running_reward, epoch_ite)


        if (running_reward > env.spec.reward_threshold):
            print("\nSolved!")
            break
示例#10
0
def main(env_name, lr, state_scale, reward_scale, clip, train_epoch,
         max_episodes, max_timesteps, batch_size, max_iterations, gamma,
         gae_lambda, entropy_coefficient, start_running_reward, update_rate):

    # ENVIROMENT
    env_name = env_name
    env = ChessEnv()

    # PARAMETERS
    learning_rate = lr
    state_scale = state_scale
    reward_scale = reward_scale
    clip = clip
    n_epoch = train_epoch
    max_episodes = max_episodes
    max_timesteps = max_timesteps
    batch_size = batch_size
    max_iterations = max_iterations
    gamma = gamma
    gae_lambda = gae_lambda
    entropy_coefficient = entropy_coefficient

    # NETWORK
    value_model = ValueNetwork().to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork().to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = start_running_reward

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # Create SavedEnvs queue
    SavedEnv = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = ChessEnv()
        SavedEnv.put((env, env.reset(), 0))

    # START ITERATING
    for ite in tqdm(range(max_iterations), ascii=True):
        # Load model to rival each update_rate epochs
        if ite % update_rate == 0:
            print("\nUpdating")
            rival_policy = PolicyNetwork().to(device)
            rival_policy.load_state_dict(policy_model.state_dict())

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        print("\nSimulating")
        start_simulation = time.perf_counter()

        q = queue.SimpleQueue()

        env_list = []
        while not SavedEnv.empty():
            env_list.append(SavedEnv.get())

        threads = []
        for saved_env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, saved_env, SavedEnv,
                                     max_timesteps, state_scale, reward_scale,
                                     policy_model, value_model, gamma,
                                     gae_lambda, device, rival_policy
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # for saved_env in env_list:
        #     if ite % 20 == 0:
        #         update_policy = True
        #     else:
        #         update_policy = False
        #     collect(q, env_name, saved_env,
        #                         SavedEnv, max_timesteps, state_scale, reward_scale,
        #                         policy_model, value_model, gamma,
        #                         gae_lambda, device, update_policy)

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        end_simulation = time.perf_counter()
        print(f"Simulation time: {end_simulation-start_simulation:.2f} ")

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

                writer.add_scalar("Average Episode Reward", ep_reward,
                                  episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare dataset
        history.build_dataset()

        data_loader = DataLoader(history,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True)

        print("Training")
        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        end_training = time.perf_counter()
        print(f"Training time: {end_training-end_simulation:.2f}")

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if (running_reward > 0):
            print("\nSolved!")
            break
示例#11
0
def main():

    # ENVIROMENT
    # env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    # env_name = "Acrobot-v1"
    env_name = "MountainCar-v0"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 100
    batch_size = 32
    max_iterations = 1000
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01
    env_threshold = env.spec.reward_threshold

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    EnvQueue = queue.SimpleQueue()

    for _ in range(max_episodes):
        env = gym.make(env_name)
        observation = env.reset()
        EnvQueue.put((env, observation, 0))

    for ite in tqdm(range(max_iterations), ascii=True):

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        q = queue.SimpleQueue()

        env_list = []
        while not EnvQueue.empty():
            env_list.append(EnvQueue.get())

        threads = []
        for env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, env, EnvQueue, max_timesteps,
                                     state_scale, reward_scale, policy_model,
                                     value_model, gamma, gae_lambda, device
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Running Reward", running_reward,
                                  episode_ite)
                writer.add_scalar("Episode Reward", ep_reward, episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare dataset
        history.build_dataset()

        data_loader = DataLoader(history,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        if (running_reward > env_threshold):
            print("\nSolved!")
            break
示例#12
0
文件: agent.py 项目: mfsuve/TORCS-RL
class SAC_Agent:
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')

    def train(self):
        remove_log_file()
        clear_action_logs()
        eps_n = 0
        rewards = []
        test_rewards = []
        best_reward = -np.inf
        info = None
        for eps_n in range(1, self.args.max_eps + 1):  # Train loop
            self.set_mode('train')
            relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0
            state = self.env.reset(relaunch=relaunch,
                                   render=False,
                                   sampletrack=False)
            eps_r = 0
            sigma = (self.args.start_sigma - self.args.end_sigma) * (max(
                0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma
            randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma,
                                                     self.action_size)

            for step in range(self.args.max_eps_time):  # Episode
                action = self.policy_net.get_train_action(state, randomprocess)
                next_state, reward, done, info = self.env.step(action)

                self.buffer.push(state, action, reward, next_state, done)

                state = next_state
                eps_r += reward

                if len(self.buffer) > self.args.batch_size:
                    self.update()

                if done:
                    break

            rewards.append(eps_r)

            test_reward = self.test(eps_n)
            test_rewards.append(test_reward)

            if test_reward > best_reward:
                best_reward = test_reward
                self.save_checkpoint(eps_n, best_reward)

            info_str = ', '.join(
                [key for key in info.keys() if key != 'place'])
            info_str += f", {info['place']}. place"
            log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}'
                )

            if eps_n % self.args.plot_per == 0:
                self.plot(rewards, test_rewards, eps_n)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(
            self.args.batch_size)

        state = FloatTensor(state).to(self.args.device)
        next_state = FloatTensor(next_state).to(self.args.device)
        action = FloatTensor(action).to(self.args.device)
        reward = FloatTensor(reward).unsqueeze(1).to(self.args.device)
        done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(
            state)

        # Training Q function
        target_value = self.target_value_net(next_state)
        target_q_value = reward + (1 - done) * self.args.gamma * target_value
        q_value_loss1 = self.soft_q_loss1(predicted_q_value1,
                                          target_q_value.detach())
        q_value_loss2 = self.soft_q_loss2(predicted_q_value2,
                                          target_q_value.detach())

        self.soft_q_opt1.zero_grad()
        q_value_loss1.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net1.parameters())
        self.soft_q_opt1.step()
        self.soft_q_opt2.zero_grad()
        q_value_loss2.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net2.parameters())
        self.soft_q_opt2.step()

        # Training Value function
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum(
        )
        value_loss = self.value_criterion(predicted_value,
                                          target_value_func.detach())

        self.value_opt.zero_grad()
        value_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.value_net.parameters())
        self.value_opt.step()

        # Training Policy function
        policy_loss = (log_prob - predicted_new_q_value).mean()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.policy_net.parameters())
        self.policy_opt.step()

        # Updating target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data *
                                    (1.0 - self.args.soft_tau) +
                                    param.data * self.args.soft_tau)

    def test(self, eps_n):
        self.set_mode('eval')
        rewards = []
        for step in range(self.args.test_rate):
            render = (eps_n % 30 == 0) and (step == 0)
            relaunch = render or ((eps_n % 30 == 0) and (step == 1))
            state = self.env.reset(relaunch=relaunch,
                                   render=render,
                                   sampletrack=False)
            running_reward = 0
            for t in range(self.args.max_eps_time):
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                store(action, eps_n, reward, info, t == 0)
                running_reward += reward
                if done:
                    break
            rewards.append(running_reward)
        avg_reward = sum(rewards) / self.args.test_rate
        return avg_reward

    def plot(self, rewards, test_rewards, eps_n):
        torch.save({
            'train_rewards': rewards,
            'test_rewards': test_rewards
        }, f'{self.plot_folder}/{eps_n}.pth')
        figure = plt.figure()
        plt.plot(rewards, label='Train Rewards')
        plt.plot(test_rewards, label='Test Rewards')
        plt.xlabel('Episode')
        plt.legend()
        plt.savefig(f'{self.plot_folder}/{eps_n}.png')
        try:
            send_mail(f'Improved Torcs SAC | Episode {eps_n}',
                      f'{self.plot_folder}/{eps_n}.png')
            log('Mail has been sent.')
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit')
            raise
        except Exception as e:
            print('Mail Exception occured:', e)
            emsg = e.args[-1]
            emsg = emsg[:1].lower() + emsg[1:]
            log('Couldn\'t send mail because', emsg)

    def clip_grad(self, parameters):
        for param in parameters:
            param.grad.data.clamp_(-1, 1)

    def set_mode(self, mode):
        if mode == 'train':
            self.value_net.train()
            self.target_value_net.train()
            self.soft_q_net1.train()
            self.soft_q_net2.train()
            self.policy_net.train()
        elif mode == 'eval':
            self.value_net.eval()
            self.target_value_net.eval()
            self.soft_q_net1.eval()
            self.soft_q_net2.eval()
            self.policy_net.eval()
        else:
            raise ValueError('mode should be either train or eval')

    def save_checkpoint(self, eps_n, test_reward):
        self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2,
                       self.policy_net)
        self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth')
        log(f'Saved checkpoint at episode {eps_n}.')

    def load_checkpoint(self, load_from):
        state_dicts = torch.load(load_from)
        self.value_net.load_state_dict(state_dicts['best_value'])
        self.soft_q_net1.load_state_dict(state_dicts['best_q1'])
        self.soft_q_net2.load_state_dict(state_dicts['best_q2'])
        self.policy_net.load_state_dict(state_dicts['best_policy'])
        print(f'Loaded from {load_from}.')

    def race(self, sampletrack=True):
        with torch.no_grad():
            state = self.env.reset(relaunch=True,
                                   render=True,
                                   sampletrack=sampletrack)
            running_reward = 0
            done = False
            while not done:
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                running_reward += reward

            print('Reward:', running_reward)
示例#13
0
class Agent():
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 env_id,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 layer_1_size=256,
                 layer_2_size=256,
                 batch_size=100,
                 reward_scale=2):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        self.batch_size = batch_size
        self.n_actions = n_actions
        self.scale = reward_scale
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer_1_size,
                                  layer_2_size,
                                  n_actions=n_actions,
                                  name=env_id + '_actor',
                                  max_action=env.action_space.high)

        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      layer_1_size,
                                      layer_2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_1')

        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      layer_1_size,
                                      layer_2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_2')

        self.value = ValueNetwork(beta,
                                  input_dims,
                                  layer_1_size,
                                  layer_2_size,
                                  name=env_id + '_value')

        self.target_value = ValueNetwork(beta,
                                         input_dims,
                                         layer_1_size,
                                         layer_2_size,
                                         name=env_id + '_target_value')

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transitions(state, action, reward, state_, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        value_params = self.value.named_parameters()
        target_value_params = self.target_value.named_parameters()

        value_state_dict = dict(value_params)
        target_value_state_dict = dict(target_value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() \
                + (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, state_, done =\
            self.memory.sample_buffer(self.batch_size)

        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        state_ = T.tensor(state_, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)
        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(state, actions)
        q2_new_policy = self.critic_2(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(state, actions)
        q2_new_policy = self.critic_2(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1(state, action).view(-1)
        q2_old_policy = self.critic_2(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
示例#14
0
    numrun = 1
    
    for run in range(numrun):
        env = make_env()

        in_size = env.observation_space.shape[0]
        num_actions = env.action_space.n

        network = network_factory(in_size, num_actions, env)
        network.to(device)
        pe = PolicyNetwork(network)
        
        # Load policy to test
        #pe.network.load_state_dict(torch.load('saved_network_50000_baseline.pkl'))
        
        ve = ValueNetwork(in_size)
        ep_returns = reinforce(env, pe, ve, episodes) #,ve , loss_policy, loss_value
            
        #fwrite = open('runs_data/'+str(run)+'.pkl','wb')
        #fwrite = open('runs_data/0.pkl','wb')
        #pickle.dump(ep_returns, fwrite)
        #fwrite.close()
            
        
        
    window = 100
    plt.figure(figsize=(12,8))
    plt.plot(sliding_window(ep_returns, window))
    plt.title("Episode Return")
    plt.xlabel("Episode")
    plt.ylabel("Average Return (Sliding Window 100)")
示例#15
0
class Agent():
    def __init__(self,
                 alpha=0.0003,
                 beta=.0003,
                 input_dims=[8],
                 env=None,
                 gamma=.99,
                 n_actions=2,
                 max_size=1000000,
                 layer1_size=256,
                 layer2_size=256,
                 tau=.005,
                 batch_size=256,
                 reward_scale=2):
        # reward scales  depends on action convention for the environment
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        # set up classes
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  max_action=env.action_space.high,
                                  n_actions=n_actions,
                                  name='actor')

        self.critic1 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions=n_actions,
                                     name='critic_1')
        self.critic2 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions=n_actions,
                                     name='critic_2')

        self.value = ValueNetwork(beta, input_dims, name='value')
        # target value
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')
        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        # here we turn into a tensor
        state = T.tensor([observation]).to(self.actor.device).float()
        # print(type(state))
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + (
                1 - tau) * target_value_state_dict[name].clone()
        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print("saving models:")
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic1.save_checkpoint()
        self.critic2.save_checkpoint()

    def load_models(self):
        print("loading models:")
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic1.load_checkpoint()
        self.critic2.load_checkpoint()

    def learn(self):
        #  must fully load up memory, otherwise must keep learning
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = .5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic1.forward(state, actions)
        q2_new_policy = self.critic2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic1.optimizer.zero_grad()
        self.critic2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic1.forward(state, action).view(-1)
        q2_old_policy = self.critic2.forward(state, action).view(-1)
        critic_1_loss = .5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = .5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic1.optimizer.step()
        self.critic2.optimizer.step()

        self.update_network_parameters()