Example #1
 def init_value_network(self,
                        shared_network=None,
                        activation='linear',
                        loss='mse'):
     if self.rl_method == 'td3':
         self.critic = CriticNetwork(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     num_steps=self.num_steps,
                                     activation=activation,
                                     loss=loss,
                                     lr=self.lr)
     elif self.net == 'dnn':
         self.value_network = DNN(input_dim=self.num_features,
                                  output_dim=self.agent.NUM_ACTIONS,
                                  lr=self.lr,
                                  shared_network=shared_network,
                                  activation=activation,
                                  loss=loss)
     elif self.net == 'lstm':
         self.value_network = LSTMNetwork(input_dim=self.num_features,
                                          output_dim=self.agent.NUM_ACTIONS,
                                          lr=self.lr,
                                          num_steps=self.num_steps,
                                          shared_network=shared_network,
                                          activation=activation,
                                          loss=loss)
     if self.reuse_models and \
             os.path.exists(self.value_network_path):
         self.value_network.load_model(model_path=self.value_network_path)
Example #2
    def __init__(self, alpha, beta, input_dims, tau, env,
            env_id, gamma=0.99, 
            n_actions=2, max_size=1000000, layer1_size=256,
            layer2_size=256, batch_size=100, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, layer1_size,
                                  layer2_size, n_actions=n_actions,
                                  name=env_id+'_actor', 
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name=env_id+'_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name=env_id+'_critic_2')
       
        self.value = ValueNetwork(beta, input_dims, layer1_size,
                                      layer2_size, name=env_id+'_value')
        self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                         layer2_size, name=env_id+'_target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #3
File: SAC.py  Project: bhargavCSSE/adv-RL
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 ent_alpha=0.02,
                 batch_size=256,
                 reward_scale=2,
                 layer1_size=256,
                 layer2_size=256,
                 chkpt_dir='tmp/sac'):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.ent_alpha = ent_alpha

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  n_actions=n_actions,
                                  fc1_dims=layer1_size,
                                  fc2_dims=layer2_size,
                                  name='actor',
                                  max_action=env.action_space.high,
                                  chkpt_dir=chkpt_dir)

        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      fc1_dims=layer1_size,
                                      fc2_dims=layer2_size,
                                      name='critic_1',
                                      chkpt_dir=chkpt_dir)
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      fc1_dims=layer1_size,
                                      fc2_dims=layer2_size,
                                      name='critic_2',
                                      chkpt_dir=chkpt_dir)
        self.value = ValueNetwork(beta,
                                  input_dims,
                                  fc1_dims=layer1_size,
                                  fc2_dims=layer2_size,
                                  name='value',
                                  chkpt_dir=chkpt_dir)
        self.target_value = ValueNetwork(beta,
                                         input_dims,
                                         fc1_dims=layer1_size,
                                         fc2_dims=layer2_size,
                                         name='target_value',
                                         chkpt_dir=chkpt_dir)

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #4
    def __init__(self,
                 input_dims,
                 alpha=0.001,
                 beta=0.002,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 fc1=400,
                 fc2=300,
                 batch_size=64,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)
Example #5
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #6
    def __init__(self,
                 alpha=3e-4,
                 beta=3e-4,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=5e-3,
                 fc1_dim=256,
                 fc2_dim=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions,
                                  env.action_space.high)
        self.critic1 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions,
                                     name='critic1')
        self.critic2 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions,
                                     name='critic2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #7
    def __init__(self,
                 actor_dims,
                 critic_dims,
                 n_actions,
                 n_agents,
                 agent_idx,
                 chkpt_dir,
                 alpha=0.01,
                 beta=0.01,
                 fc1=64,
                 fc2=64,
                 gamma=0.95,
                 tau=0.01):
        self.gamma = gamma
        self.tau = tau
        self.n_actions = n_actions
        self.agent_name = 'agent_%s' % agent_idx
        self.actor = ActorNetwork(alpha,
                                  actor_dims,
                                  fc1,
                                  fc2,
                                  n_actions,
                                  chkpt_dir=chkpt_dir,
                                  name=self.agent_name + '_actor')
        self.critic = CriticNetwork(beta,
                                    critic_dims,
                                    fc1,
                                    fc2,
                                    n_agents,
                                    n_actions,
                                    chkpt_dir=chkpt_dir,
                                    name=self.agent_name + '_critic')
        self.target_actor = ActorNetwork(alpha,
                                         actor_dims,
                                         fc1,
                                         fc2,
                                         n_actions,
                                         chkpt_dir=chkpt_dir,
                                         name=self.agent_name +
                                         '_target_actor')
        self.target_critic = CriticNetwork(beta,
                                           critic_dims,
                                           fc1,
                                           fc2,
                                           n_agents,
                                           n_actions,
                                           chkpt_dir=chkpt_dir,
                                           name=self.agent_name +
                                           '_target_critic')

        self.update_network_parameters(tau=1)
Example #8
    def __init__(self, input_dims, env, n_actions):
        self.memory = ReplayBuffer(input_dims)
        self.n_actions = n_actions

        self.actor_nn = ActorNetwork(input_dims,
                                     n_actions=n_actions,
                                     name=Constants.env_id + '_actor',
                                     max_action=env.action_space.n)
        self.critic_local_1_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_1')
        self.critic_local_2_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_2')
        self.critic_target_1_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_1')
        self.critic_target_2_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_2')
        self.value_nn = ValueNetwork(input_dims,
                                     name=Constants.env_id + '_value')
        self.target_value_nn = ValueNetwork(input_dims,
                                            name=Constants.env_id +
                                            '_target_value')
        self.update_network_parameters(tau=1)
Example #9
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 n_actions,
                 gamma=0.99,
                 max_size=1000000,
                 fc1_dims=400,
                 fc2_dims=300,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  fc1_dims,
                                  fc2_dims,
                                  n_actions=n_actions,
                                  name='actor')
        self.critic = CriticNetwork(beta,
                                    input_dims,
                                    fc1_dims,
                                    fc2_dims,
                                    n_actions=n_actions,
                                    name='critic')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         fc1_dims,
                                         fc2_dims,
                                         n_actions=n_actions,
                                         name='target_actor')

        self.target_critic = CriticNetwork(beta,
                                           input_dims,
                                           fc1_dims,
                                           fc2_dims,
                                           n_actions=n_actions,
                                           name='target_critic')

        self.update_network_parameters(tau=1)
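OUActionNoise above supplies temporally correlated exploration noise for the deterministic policy. The class itself is not shown in this example; the following is only a minimal sketch of the Ornstein-Uhlenbeck process it presumably implements (class name and parameter defaults are assumptions, not taken from the project):

import numpy as np

class OUActionNoiseSketch:
    # Hypothetical stand-in for OUActionNoise, not the project's class:
    # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I)
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x_prev = np.zeros_like(mu)

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x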
Example #10
    def __init__(self,
                 docker_client,
                 name='worker',
                 port=3101,
                 model_path='../models/ddpg',
                 log_path='../logs/ddpg'):

        self.state_size = 29
        self.action_size = 3

        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # target network soft-update parameter
        self.lra = 0.0001  # learning rate for the actor
        self.lrc = 0.001  # learning rate for the critic
        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(self.state_size, self.action_size,
                                  tf.train.AdamOptimizer(self.lra), self.tau)

        self.critic = CriticNetwork(self.state_size, self.action_size,
                                    tf.train.AdamOptimizer(self.lrc), self.tau)

        self.buff = ReplayBuffer(self.buffer_size)
        self.saver = tf.train.Saver()
        self._create_summary()
        self.summary_histogram = tf.summary.merge_all()
Example #11
def main(_):
    '''
    previous = tf.train.import_meta_graph(SAVE_DIR + '/model.ckpt.meta')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        previous.restore(sess,tf.train.latest_checkpoint(SAVE_DIR+'/'))
        last_vars = tf.trainable_variables()
        data = sess.run(last_vars)
        print('Model Restored')
    '''
    tf.reset_default_graph()

    with tf.Session() as sess:
        env = Preon_env(opt.env_params)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = 9
        action_dim = 3
        goal_dim = 2

        actor = ActorNetwork(sess, state_dim, action_dim, goal_dim,
                             ACTOR_LEARNING_RATE, TAU, opt.env_params)
        critic = CriticNetwork(sess, state_dim, action_dim, goal_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), opt.env_params)

        if opt.train:
            train(sess, env, actor, critic, action_dim, goal_dim, state_dim)
        else:
            test(sess, env, actor, critic, action_dim, goal_dim, state_dim,
                 opt.test_goal)
Example #12
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 action_bound,
                 tau,
                 env,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.action_bound = action_bound
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  n_actions=n_actions,
                                  name='Actor')
        self.critic = CriticNetwork(beta,
                                    input_dims,
                                    layer1_size,
                                    layer2_size,
                                    n_actions=n_actions,
                                    name='Critic')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         n_actions=n_actions,
                                         name='TargetActor')
        self.target_critic = CriticNetwork(beta,
                                           input_dims,
                                           layer1_size,
                                           layer2_size,
                                           n_actions=n_actions,
                                           name='TargetCritic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_network_parameters(tau=1)
Example #13
    def __init__(self,
                 n_actions,
                 n_states,
                 obs_shape,
                 gamma=0.99,
                 lr=0.0003,
                 gae_lambda=0.95,
                 entropy_coeff=0.0005,
                 ppo_clip=0.2,
                 mini_batch_size=64,
                 n_epochs=10,
                 clip_value_loss=True,
                 normalize_observation=False,
                 stop_normalize_obs_after_timesteps=50000,
                 fc1=64,
                 fc2=64,
                 environment='None',
                 run=0):

        self.entropy_coeff = entropy_coeff
        self.clip_value_loss = clip_value_loss
        self.gamma = gamma
        self.ppo_clip = ppo_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.normalize_observation = normalize_observation
        self.stop_obs_timesteps = stop_normalize_obs_after_timesteps
        self.timestep = 0

        self.actor = ActorNetwork(n_states=n_states,
                                  n_actions=n_actions,
                                  lr=lr,
                                  fc1_dims=fc1,
                                  fc2_dims=fc2,
                                  chkpt_dir=environment,
                                  run=run)
        self.critic = CriticNetwork(n_states=n_states,
                                    lr=lr,
                                    fc1_dims=fc1,
                                    fc2_dims=fc2,
                                    chkpt_dir=environment,
                                    run=run)
        self.memory = PPOMemory(mini_batch_size, gamma, gae_lambda)
        self.running_stats = RunningStats(shape_states=obs_shape,
                                          chkpt_dir=environment,
                                          run=run)
Example #14
    def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0,
                 n_actions=2, max_size=1000000, layer1_size=400,
                 layer2_size=300, batch_size=100, reward_scale=2,
                 path_dir='model/sac'):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  n_actions=n_actions,
                                  name='_actor',
                                  max_action=max_action,
                                  chkpt_dir=path_dir)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name='_critic_1',
                                      chkpt_dir=path_dir)
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name='_critic_2',
                                      chkpt_dir=path_dir)
        self.value = ValueNetwork(beta,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  name='_value',
                                  chkpt_dir=path_dir)
        self.target_value = ValueNetwork(beta,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         name='_target_value',
                                         chkpt_dir=path_dir)

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #15
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 gamma: float = 0.99,
                 lr_actor: float = 0.001,
                 lr_critic: float = 0.003,
                 weight_decay: float = 0.0001,
                 tau: float = 0.001,
                 buffer_size: int = 100000,
                 batch_size: int = 64):
        """
        :param state_size: how many states does the agent get as input (input size of neural networks)
        :param action_size: from how many actions can the agent choose
        :param gamma: discount factor
        :param lr_actor: learning rate of the actor network
        :param lr_critic: learning rate of the critic network
        :param weight_decay:
        :param tau: soft update parameter
        :param buffer_size: size of replay buffer
        :param batch_size: size of learning batch (mini-batch)
        """
        self.tau = tau
        self.gamma = gamma

        self.batch_size = batch_size

        self.actor_local = ActorNetwork(state_size, action_size).to(device)
        self.actor_target = ActorNetwork(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        print(self.actor_local)

        self.critic_local = CriticNetwork(state_size, action_size).to(device)
        self.critic_target = CriticNetwork(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)
        print(self.critic_local)

        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
        # this would probably also work with Gaussian noise instead of Ornstein-Uhlenbeck process
        self.noise = OUNoise(action_size)
Example #16
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8],
                 env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  name='actor', max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)  # sets the target-network parameters equal to the online-network parameters
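update_network_parameters(tau=1) performs the hard copy described in the comment above. The method itself is not shown in this example; a minimal PyTorch sketch of the usual soft/hard update it stands for (function name and usage are assumptions, not the project's actual API):

import torch

def soft_update(target_net: torch.nn.Module, online_net: torch.nn.Module, tau: float) -> None:
    # target <- tau * online + (1 - tau) * target; tau = 1 overwrites the target completely
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * o_param)

With tau equal to self.tau (0.005 here), the same call performs the slow Polyak averaging applied after each learning step.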
Example #17
    def __init__(self,
                 n_actions,
                 input_dims,
                 gamma=0.99,
                 alpha=0.0003,
                 gae_lambda=0.95,
                 policy_clip=0.2,
                 batch_size=64,
                 n_epochs=10,
                 chkpt_dir='models/'):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.chkpt_dir = chkpt_dir

        self.actor = ActorNetwork(n_actions)
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic = CriticNetwork()
        self.critic.compile(optimizer=Adam(learning_rate=alpha))
        self.memory = PPOMemory(batch_size)
Example #18
    def __init__(self, logger, replay_buffer):
        super(Learner, self).__init__(name="Learner")
        self.device = Params.DEVICE

        with tf.device(self.device), self.name_scope:
            self.dtype = Params.DTYPE
            self.logger = logger
            self.batch_size = Params.MINIBATCH_SIZE
            self.gamma = Params.GAMMA
            self.tau = Params.TAU
            self.replay_buffer = replay_buffer
            self.priority_beta = tf.Variable(Params.BUFFER_PRIORITY_BETA_START)
            self.running = tf.Variable(True)
            self.n_steps = tf.Variable(0)

            # Init Networks
            self.actor = ActorNetwork(with_target_net=True)
            self.critic = CriticNetwork()

            # Save shared variables
            self.policy_variables = self.actor.tvariables + self.actor.nvariables
Example #19
    def __init__(self,
                 alpha=0.00005,
                 beta=0.00005,
                 input_dims=5,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        latent_dims = 10

        self.actor = ActorNetwork_2(alpha,
                                    latent_dims,
                                    env.action_space.high,
                                    n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta,
                                      latent_dims,
                                      n_actions,
                                      name='critic_det_1')
        self.critic_2 = CriticNetwork(beta,
                                      latent_dims,
                                      n_actions,
                                      name='critic__det_2')
        self.value = ValueNetwork(beta, latent_dims, name='value_det')
        self.target_value = ValueNetwork(beta,
                                         latent_dims,
                                         name='target_value_det')
        self.VAE = LinearVAE()

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #20
    def __init__(self,
                 num_agents=2,
                 obs_size=24,
                 act_size=2,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3,
                 weight_decay_actor=1e-5,
                 weight_decay_critic=1e-4,
                 clip_grad=1.0):
        super(MADDPGAgent, self).__init__()

        # Write parameters
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.clip_grad = clip_grad

        # Create all the networks
        self.actor = ActorNetwork(obs_size, act_size).to(device)
        self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
        self.target_actor = ActorNetwork(obs_size, act_size).to(device)
        self.target_critic = CriticNetwork(num_agents, obs_size,
                                           act_size).to(device)

        # Copy initial network parameters to target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize training optimizers and OU noise
        self.noise = OUNoise(act_size, scale=1.0)
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)
Example #21
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 max_action=1,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=512,
                 layer2_size=512,
                 batch_size=512,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  n_actions=n_actions,
                                  name='actor',
                                  max_action=max_action)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #22
def main(args):

    with tf.Session() as session:
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        # initialize ROS interface
        agent = fake.fake_agent()
        plant = fake.fake_plant()

        state_shape = agent.get_state_shape()
        action_shape = agent.get_action_shape()
        action_bound = agent.get_action_bound()

        # initialize function approximators
        actor_network = ActorNetwork(session,
                                     state_shape,
                                     action_shape,
                                     action_bound,
                                     float(args['actor_lr']),
                                     float(args['tau']),
                                     loss_mask=True)
        critic_network = CriticNetwork(session,
                                       state_shape,
                                       action_shape,
                                       float(args['critic_lr']),
                                       float(args['tau']),
                                       float(args['gamma']),
                                       actor_network.get_num_trainable_vars(),
                                       loss_mask=True)
        predictor_network = fake.fake_predictor()
        latent_network = fake.fake_latent()

        learn(session,
              actor_network,
              critic_network,
              predictor_network,
              agent,
              plant,
              latent_network=latent_network,
              buffer_size=int(args['buffer_size']),
              batch_size=int(args['batch_size']),
              trace_length=int(args['trace_length']),
              update_freq=int(args['update_freq']),
              pretrain_steps=int(args['pretrain_steps']),
              update_steps=int(args['update_steps']),
              max_episodes=int(args['max_episodes']),
              max_ep_steps=int(args['max_episode_len']),
              summary_dir=args['summary_dir'])
Example #23
    def __init__(self, alpha, beta, input_dims, tau, env,
            gamma=0.99, update_actor_interval=2, warmup=1000,
            n_actions=2, max_size=1000000, layer1_size=400,
            layer2_size=300, batch_size=100, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha, input_dims, layer1_size,
                                  layer2_size, n_actions=n_actions,
                                  name='actor')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name='critic_2')

        self.target_actor = ActorNetwork(alpha, input_dims, layer1_size,
                                         layer2_size, n_actions=n_actions,
                                         name='target_actor')
        self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                         layer2_size, n_actions=n_actions,
                                         name='target_critic_1')
        self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                         layer2_size, n_actions=n_actions,
                                         name='target_critic_2')

        self.noise = noise
        self.update_network_parameters(tau=1)
Example #24
 def __init__(self, input_dims, env, n_actions):
     self.memory = ReplayBuffer(input_dims)
     self.n_actions = n_actions
     self.actor_nn = ActorNetwork(input_dims,
                                  n_actions=n_actions,
                                  name=Constants.env_id + '_actor',
                                  max_action=env.action_space.n)
     self.critic_local_1_nn = CriticNetwork(input_dims,
                                            n_actions=n_actions,
                                            name=Constants.env_id +
                                            '_critic_local_1')
     self.critic_local_2_nn = CriticNetwork(input_dims,
                                            n_actions=n_actions,
                                            name=Constants.env_id +
                                            '_critic_local_2')
     self.critic_target_1_nn = CriticNetwork(input_dims,
                                             n_actions=n_actions,
                                             name=Constants.env_id +
                                             '_critic_target_1')
     self.critic_target_2_nn = CriticNetwork(input_dims,
                                             n_actions=n_actions,
                                             name=Constants.env_id +
                                             '_critic_target_2')
Example #25
PRIORITIZED = True
BATCH_SIZE = 64
RANDOM_SEED = 1234

# set up environment
env = gym.make(ENV_NAME)
bound = np.max(env.action_space.high)
state = state_prime = env.reset()
action = env.action_space.sample()
a_dim = len(action)
s_dim = len(state)

# initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ
actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound)
explore = ExploreNetwork(sess, state, action, EXP_LEARNING_RATE, TAU, bound)
critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU)

# initialize variables and store tensorboard graph
sess.run(tf.global_variables_initializer())
summary_writer = tf.train.SummaryWriter("./tf_logs", graph=sess.graph)
summary_writer.close()

# initialize target network Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ
actor.update_target_network()
explore.update_target_network()
critic.update_target_network()

# initialize replay buffer
replay = ReplayBuffer(
    BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED
)
Example #26
File: main.py  Project: yosider/RLSnipets
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)), \
                                np.reshape(a, (ACTION_DIM,)), \
                                r, \
                                terminal, \
                                np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q-values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the target values for training the critic.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random mini-batch, not a single
                    # episode, so an episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this perhaps be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)

                    break
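The targets accumulated in the inner loop above are the standard DDPG TD targets: for each sampled transition $(s_i, a_i, r_i, s_{i+1})$ with terminal flag $t_i$,

$$y_i = \begin{cases} r_i & \text{if } t_i, \\ r_i + \gamma\, Q'\big(s_{i+1}, \mu'(s_{i+1})\big) & \text{otherwise,} \end{cases}$$

where $Q'$ and $\mu'$ are the target critic and target actor queried via predict_target.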
Example #27
class Agent:
    def __init__(self,
                 n_actions,
                 input_dims,
                 gamma=0.99,
                 alpha=0.0003,
                 gae_lambda=0.95,
                 policy_clip=0.2,
                 batch_size=64,
                 n_epochs=10,
                 chkpt_dir='models/'):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.chkpt_dir = chkpt_dir

        self.actor = ActorNetwork(n_actions)
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic = CriticNetwork()
        self.critic.compile(optimizer=Adam(learning_rate=alpha))
        self.memory = PPOMemory(batch_size)

    def store_transition(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save(self.chkpt_dir + 'actor')
        self.critic.save(self.chkpt_dir + 'critic')

    def load_models(self):
        print('... loading models ...')
        self.actor = keras.models.load_model(self.chkpt_dir + 'actor')
        self.critic = keras.models.load_model(self.chkpt_dir + 'critic')

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])

        probs = self.actor(state)
        dist = tfp.distributions.Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic(state)

        action = action.numpy()[0]
        value = value.numpy()[0]
        log_prob = log_prob.numpy()[0]

        return action, log_prob, value

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            for t in range(len(reward_arr) - 1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr) - 1):
                    a_t += discount * (reward_arr[k] +
                                       self.gamma * values[k + 1] *
                                       (1 - int(dones_arr[k])) - values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t

            for batch in batches:
                with tf.GradientTape(persistent=True) as tape:
                    states = tf.convert_to_tensor(state_arr[batch])
                    old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                    actions = tf.convert_to_tensor(action_arr[batch])

                    probs = self.actor(states)
                    dist = tfp.distributions.Categorical(probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = self.critic(states)

                    critic_value = tf.squeeze(critic_value, 1)

                    prob_ratio = tf.math.exp(new_probs - old_probs)
                    weighted_probs = advantage[batch] * prob_ratio
                    clipped_probs = tf.clip_by_value(prob_ratio,
                                                     1 - self.policy_clip,
                                                     1 + self.policy_clip)
                    weighted_clipped_probs = clipped_probs * advantage[batch]
                    actor_loss = -tf.math.minimum(weighted_probs,
                                                  weighted_clipped_probs)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    returns = advantage[batch] + values[batch]
                    # critic_loss = tf.math.reduce_mean(tf.math.pow(
                    #                                  returns-critic_value, 2))
                    critic_loss = keras.losses.MSE(critic_value, returns)

                actor_params = self.actor.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_params = self.critic.trainable_variables
                critic_grads = tape.gradient(critic_loss, critic_params)
                self.actor.optimizer.apply_gradients(
                    zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(
                    zip(critic_grads, critic_params))

        self.memory.clear_memory()
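The nested loop over t and k in learn() is a direct $O(T^2)$ implementation of generalized advantage estimation: with the TD residual

$$\delta_k = r_k + \gamma\, V(s_{k+1})\,(1 - d_k) - V(s_k),$$

it accumulates

$$A_t = \sum_{k=t}^{T-2} (\gamma \lambda)^{\,k-t}\, \delta_k,$$

with $\gamma$ = self.gamma and $\lambda$ = self.gae_lambda.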
Example #28
class Agent:
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_value.weights
        for i, weight in enumerate(self.value.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))

        self.target_value.set_weights(weights)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.critic_1.save_weights(self.critic_1.checkpoint_file)
        self.critic_2.save_weights(self.critic_2.checkpoint_file)
        self.value.save_weights(self.value.checkpoint_file)
        self.target_value.save_weights(self.target_value.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.critic_1.load_weights(self.critic_1.checkpoint_file)
        self.critic_2.load_weights(self.critic_2.checkpoint_file)
        self.value.load_weights(self.value.checkpoint_file)
        self.target_value.load_weights(self.target_value.checkpoint_file)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value(states), 1)
            value_ = tf.squeeze(self.target_value(states_), 1)

            current_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=False)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, current_policy_actions)
            q2_new_policy = self.critic_2(states, current_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_network_gradient = tape.gradient(value_loss,
                                               self.value.trainable_variables)
        self.value.optimizer.apply_gradients(
            zip(value_network_gradient, self.value.trainable_variables))

        with tf.GradientTape() as tape:
            # the reparameterization trick from the original paper: sample_normal
            # is called with reparameterize=True so gradients flow through the
            # sampled actions into the actor parameters.
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, new_policy_actions)
            q2_new_policy = self.critic_2(states, new_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            actor_loss = log_probs - critic_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        with tf.GradientTape(persistent=True) as tape:
            # persistent=True lets us take gradients of both critic losses from this one tape.
            q_hat = self.scale * reward + self.gamma * value_ * (1 - done)
            q1_old_policy = tf.squeeze(self.critic_1(state, action), 1)
            q2_old_policy = tf.squeeze(self.critic_2(state, action), 1)
            critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat)
            critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat)

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)

        self.critic_1.optimizer.apply_gradients(
            zip(critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(
            zip(critic_2_network_gradient, self.critic_2.trainable_variables))

        self.update_network_parameters()
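The three gradient tapes in learn() above correspond to the soft actor-critic objectives, with the entropy term carried by log_probs and $\tilde a \sim \pi(\cdot \mid s)$ drawn by sample_normal:

$$L_V = \tfrac{1}{2}\,\mathbb{E}\Big[\big(V(s) - \big(\min_i Q_i(s, \tilde a) - \log \pi(\tilde a \mid s)\big)\big)^2\Big], \qquad L_\pi = \mathbb{E}\big[\log \pi(\tilde a \mid s) - \min_i Q_i(s, \tilde a)\big],$$

$$L_{Q_i} = \tfrac{1}{2}\,\mathbb{E}\big[(Q_i(s, a) - \hat q)^2\big], \qquad \hat q = \text{reward\_scale} \cdot r + \gamma\,(1 - d)\, V'(s'),$$

where $V'$ is the target value network maintained by the Polyak average in update_network_parameters.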
Example #29
class Agent():
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 n_actions,
                 gamma=0.99,
                 max_size=50000,
                 fc1_dims=400,
                 fc2_dims=300,
                 batch_size=32):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  fc1_dims,
                                  fc2_dims,
                                  n_actions=n_actions,
                                  name='actor')
        self.critic = CriticNetwork(beta,
                                    input_dims,
                                    fc1_dims,
                                    fc2_dims,
                                    n_actions=n_actions,
                                    name='critic')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         fc1_dims,
                                         fc2_dims,
                                         n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(beta,
                                           input_dims,
                                           fc1_dims,
                                           fc2_dims,
                                           n_actions=n_actions,
                                           name='target_critic')

        # hard update: target networks start out identical to the online networks
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        # Set the actor to eval mode: it contains batch normalization layers and we
        # don't want to update their running statistics during action selection.
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(
            self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = self.memory.sample_buffer(
            self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.actor.device)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)

        target_actions = self.target_actor.forward(states_)
        critic_value_ = self.target_critic.forward(states_, target_actions)
        critic_value = self.critic.forward(states, actions)

        critic_value_[done] = 0.0
        critic_value_ = critic_value_.view(-1)

        target = rewards + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        # called without an argument, so tau defaults to self.tau (soft update)
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + (
                1 - tau) * target_critic_state_dict[name].clone()

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + (
                1 - tau) * target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
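
For context, this Agent is normally driven by a loop like the one below. This is only a sketch: the environment name ('Pendulum-v1'), the hyperparameters, and the episode count are illustrative assumptions, and the older gym API (reset() returning the observation, step() returning a 4-tuple) is assumed to match the rest of this document.

import gym

# hypothetical environment and hyperparameters, chosen only for illustration
env = gym.make('Pendulum-v1')
agent = Agent(alpha=0.0001, beta=0.001,
              input_dims=env.observation_space.shape, tau=0.001,
              n_actions=env.action_space.shape[0])

for episode in range(1000):
    observation = env.reset()            # older gym API: reset() returns the observation
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()                    # returns early until a full batch is buffered
        score += reward
        observation = observation_
    print('episode', episode, 'score %.1f' % score)
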
Example #30
import gym
import numpy as np
import torch as T
from networks import ActorNetwork, CriticNetwork
import gym_lqr  # importing registers the custom LQR environments with gym
if __name__ == '__main__':

    #env = gym.make('InvertedPendulumPyBulletEnv-v0')
    #env = gym.make('gym_lqr:lqr-stochastic-v0')
    env = gym.make('gym_lqr:lqr-v0')
    #env = gym.make('InvertedPendulum-v2')

    #print(env.action_space.shape[0])
    actor = ActorNetwork(0.0003, input_dims=env.observation_space.shape, \
                         n_actions=env.action_space.shape[0], max_action=env.action_space.high)

    critic_1 = CriticNetwork(0.0003, input_dims=env.observation_space.shape, \
                    n_actions=env.action_space.shape[0], name='critic_1')
    critic_2 = CriticNetwork(0.0003, input_dims=env.observation_space.shape, \
                    n_actions=env.action_space.shape[0], name='critic_2')

    actor.load_checkpoint()
    critic_1.load_checkpoint()
    critic_2.load_checkpoint()

    # Load optimal P
    env.set_P(np.load('tmp/sac/optimal_P.npy'))

    # Create States and Actions
    states = np.expand_dims(
        np.expand_dims(np.arange(-100, 100, 5, dtype=np.float32), -1), -1)
    actions = np.expand_dims(
        np.expand_dims(np.arange(-10, 10, 0.5, dtype=np.float32), -1), -1)
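
The example stops after constructing the evaluation grids. A plausible continuation, shown only as a sketch under assumptions (the critics' call signature critic(state, action), the pessimistic min(Q1, Q2) choice, and the output path are not part of the original script; device placement is ignored for brevity), is to sweep the grid and record the twin critics' Q-values so they can be compared against the analytic LQR solution encoded by P:

    # Sketch of an assumed continuation: query the loaded twin critics over the grid.
    q_surface = np.zeros((len(states), len(actions)), dtype=np.float32)
    with T.no_grad():
        for i, s in enumerate(states):        # s has shape (1, 1): a batch of one state
            for j, a in enumerate(actions):   # a has shape (1, 1): a batch of one action
                q1 = critic_1(T.tensor(s), T.tensor(a))   # assumed call: critic(state, action)
                q2 = critic_2(T.tensor(s), T.tensor(a))
                q_surface[i, j] = T.min(q1, q2).item()
    np.save('tmp/sac/q_surface.npy', q_surface)           # hypothetical output path
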
class Agent:
    def __init__(self,
                 input_dims,
                 alpha=0.001,
                 beta=0.002,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 fc1=400,
                 fc2=300,
                 batch_size=64,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_critic.set_weights(weights)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.target_actor.save_weights(self.target_actor.checkpoint_file)
        self.critic.save_weights(self.critic.checkpoint_file)
        self.target_critic.save_weights(self.target_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.target_actor.load_weights(self.target_actor.checkpoint_file)
        self.critic.load_weights(self.critic.checkpoint_file)
        self.target_critic.load_weights(self.target_critic.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0,
                                        stddev=self.noise)
        # note: if the env's action bound exceeds 1, the actor's output must be
        # scaled by max_action before clipping (see the sketch after this class)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(states_)
            critic_value_ = tf.squeeze(
                self.target_critic(states_, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = rewards + self.gamma * critic_value_ * (1 - done)
            critic_loss = keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(
            critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_network_gradient, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()
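
Regarding the note in choose_action about action bounds: when max_action is larger than 1, the actor's bounded output has to be rescaled before noise and clipping are applied. A minimal sketch of that variant, assuming the actor's final layer is tanh-bounded to [-1, 1] (illustration only, not part of the original class):

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state) * self.max_action   # rescale from [-1, 1] to the env's bound
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0, stddev=self.noise)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)
        return actions[0]
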
Example #32
            loc=mu,
            scale=sigma
        ))
    return np.array(noise)

# set up environment
env = gym.make(ENV_NAME)
bound = np.max(env.action_space.high)
state = state_prime = env.reset()
action = env.action_space.sample()
a_dim = len(action)
s_dim = len(state)

# initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ
actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound)
critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU)

# initialize variables and store tensorboard graph
sess.run(tf.global_variables_initializer())  # current TF1 name for the deprecated tf.initialize_all_variables()
summary_writer = tf.summary.FileWriter("./tf_logs", graph=sess.graph)  # tf.train.SummaryWriter was renamed in TF 1.0
summary_writer.close()

# initialize target network Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ
actor.update_target_network()
critic.update_target_network()

# initialize replay buffer
replay = ReplayBuffer(
    BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED
)
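
The setup above mirrors the DDPG pseudocode line by line; the script builds toward a per-episode interaction loop roughly like the sketch below. Everything in it is hedged: the method names (actor.predict, actor.predict_target, actor.train, actor.update_target_network, the critic counterparts, replay.add/sample_batch/size), the constants MAX_EPISODES, MAX_STEPS, BATCH_SIZE, GAMMA, and the exploration_noise() helper (the truncated function at the top of this example) are assumptions about this particular implementation, not confirmed by the original code.

# Assumed continuation: the DDPG interaction loop (names hedged as noted above).
for episode in range(MAX_EPISODES):
    state = env.reset()
    for step in range(MAX_STEPS):
        # select a_t = mu(s_t | theta_mu) + exploration noise
        action = actor.predict(np.reshape(state, (1, s_dim)))[0] + exploration_noise()
        state_prime, reward, done, _ = env.step(action)
        replay.add(state, action, reward, done, state_prime)

        if replay.size() > BATCH_SIZE:
            s_b, a_b, r_b, d_b, s2_b = replay.sample_batch(BATCH_SIZE)
            # y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1}))
            target_q = critic.predict_target(s2_b, actor.predict_target(s2_b))
            y = r_b + GAMMA * target_q.reshape(-1) * (1 - d_b)
            critic.train(s_b, a_b, y)
            # update the actor along the sampled deterministic policy gradient
            a_outs = actor.predict(s_b)
            grads = critic.action_gradients(s_b, a_outs)
            actor.train(s_b, grads[0])
            # soft-update the target networks with factor TAU
            actor.update_target_network()
            critic.update_target_network()

        state = state_prime
        if done:
            break
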