Example #1
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.local_actor_network = ActorNetwork(state_size, action_size)
        self.target_actor_network = ActorNetwork(state_size, action_size)
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(state_size, action_size)
        self.target_critic_network = CriticNetwork(state_size, action_size)
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
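Note: almost every example here draws exploration noise from an Ornstein-Uhlenbeck process (OrnsteinUhlenbeckProcess / OUNoise) without showing its definition. The snippet below is only a minimal NumPy sketch of such a process; the (shape, sigma, theta) constructor arguments mirror the call in Example #1, while mu, dt and the reset behaviour are assumptions rather than the original implementation.

import numpy as np


class OrnsteinUhlenbeckProcess:
    """Minimal OU-noise sketch: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, shape, sigma=0.2, theta=0.15, mu=0.0, dt=1e-2):
        self.shape, self.mu, self.sigma, self.theta, self.dt = shape, mu, sigma, theta, dt
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = np.ones(self.shape) * self.mu

    def sample(self):
        # Euler-Maruyama step of the OU stochastic differential equation
        dx = self.theta * (self.mu - self.state) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(self.shape)
        self.state = self.state + dx
        return self.state

At action-selection time the sample is simply added to the actor's deterministic output, and the process is reset between episodes.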
Example #2
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.actor = ActorNetwork(state_size, action_size)
        self.actor_target = ActorNetwork(state_size, action_size)
        self.soft_update(self.actor_target.parameters(),
                         self.actor.parameters(), 1)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=ACTOR_LEARNING_RATE)

        # Create one critic per agent
        self.critics = []
        self.critic_targets = []
        self.critic_optimizers = []
        for i in range(num_agents):

            # Critic
            # Note: we use action_size * num_agents since we'll pass in the actions of all agents concatenated
            critic = CriticNetwork(state_size * num_agents,
                                   action_size * num_agents)
            self.critics.append(critic)
            self.critic_targets.append(
                CriticNetwork(state_size * num_agents,
                              action_size * num_agents))
            self.soft_update(self.critic_targets[-1].parameters(),
                             critic.parameters(), 1)
            self.critic_optimizers.append(
                optim.Adam(critic.parameters(),
                           lr=CRITIC_LEARNING_RATE,
                           weight_decay=CRITIC_WEIGHT_DECAY))

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess((1, action_size),
                                                       sigma=RANDOM_SIGMA,
                                                       theta=RANDOM_THETA)
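Note: Example #2 calls self.soft_update(target.parameters(), source.parameters(), 1) right after construction; with tau = 1 the soft update degenerates into a hard copy of the freshly initialised local weights into the targets. A plausible PyTorch sketch of that helper is shown below; the argument order (target first, then source) follows the call sites above, everything else is an assumption.

import torch


@torch.no_grad()
def soft_update(target_params, source_params, tau):
    # Polyak update: target <- tau * source + (1 - tau) * target.
    # With tau = 1 this is a plain copy, as used in the constructors above.
    for target_param, source_param in zip(target_params, source_params):
        target_param.data.copy_(
            tau * source_param.data + (1.0 - tau) * target_param.data)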
Example #3
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)
Example #4
    def __init__(self,
                 input_dim,
                 action_dim,
                 critic_layers,
                 actor_layers,
                 actor_activation,
                 scope='ac_network'):

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.scope = scope

        self.x = tf.placeholder(shape=(None, input_dim),
                                dtype=tf.float32,
                                name='x')
        self.y = tf.placeholder(shape=(None, ), dtype=tf.float32, name='y')

        with tf.variable_scope(scope):
            self.actor_network = ActorNetwork(self.x,
                                              action_dim,
                                              hidden_layers=actor_layers,
                                              activation=actor_activation)

            self.critic_network = CriticNetwork(
                self.x,
                self.actor_network.get_output_layer(),
                hidden_layers=critic_layers)

            self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
            self._build()
Example #5
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')
Example #6
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
Example #7
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")
Example #8
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        # self.action_dim = env.action_space.shape[0]
        self.state_dim = env.state_size
        self.action_dim = env.action_size
        self.action_bound = (env.action_high - env.action_low) / 2

        print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
              'action_bound: ', self.action_bound)

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.action_bound)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.action_bound)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #9
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
Example #10
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0
Example #11
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
Example #12
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #13
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001  # Target Network HyperParameters
        LRA = 0.0001  # Learning rate for Actor
        LRC = 0.001  # Learning rate for Critic
        state_dim = 29  # of sensors input
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[],
                         total_step=[],
                         ave_reward=[],
                         distRaced=[],
                         distFromStart=[],
                         lastLapTime=[],
                         curLapTime=[],
                         lapTimes=[],
                         avelapTime=[],
                         ave_sp=[],
                         max_sp=[],
                         min_sp=[],
                         test_total_reward=[],
                         test_total_step=[],
                         test_ave_reward=[],
                         test_distRaced=[],
                         test_distFromStart=[],
                         test_lastLapTime=[],
                         test_curLapTime=[],
                         test_lapTimes=[],
                         test_avelapTime=[],
                         test_ave_sp=[],
                         test_max_sp=[],
                         test_min_sp=[])
Example #14
	def __init__(self, env):
		self.sess = tf.InteractiveSession()
		#self.params = loadparams() # ???
		self.env = env
		self.n_states = env.observation_space.shape[0]
		self.n_actions = env.action_space.shape[0]
		self.low = self.env.action_space.low
		self.high = self.env.action_space.high
		self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
		self.trainable_var_count = self.actor_network.get_trainable_var_count()
		self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \
			self.actor_network, self.trainable_var_count)
		self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']???
		self.exploration_noise = OUNoise(self.n_actions)
		# self.noise = Noise()
		self.gamma = GAMMA
		self.sess.run(tf.global_variables_initializer())
Example #15
    def __init__(self, env, device):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.device = device
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.actor_network = ActorNetwork(self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.state_dim, self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #16
    def __init__(self, state_dim, action_dim, env):
        self.name = 'DDPG'  # name for uploading results
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.environment = env
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #17
    def __init__(self, num_agents, state_dim, action_dim):
        # track the number of training steps
        self.time_step = 0
        # use a configured tf.Session (logs device placement) instead of InteractiveSession
        # self.sess = tf.InteractiveSession()
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=True))
        self.num_agents = num_agents
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents = self.create_multi_agents(self.sess, num_agents,
                                               self.state_dim, self.action_dim)
        # make sure the CriticNetwork is created afterwards; it summarises the mean Q value internally
        self.critic = CriticNetwork(self.sess, state_dim, action_dim)
        self.exploration_noise = OUNoise((self.num_agents, action_dim))
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # for storing checkpoints
        self.saver = tf.train.Saver()
Example #18
    def initialize(self):

        self.net = {}
        self.opt = {}

        if self.pars["twin"]:
            self.twins = ["_1", "_2"]
        else:
            self.twins = [""]

        for netname in ["act_loc", "act_tg"]:
            self.net[netname] = LinearNetwork(
                input_shape=self.pars["state_size"],
                lin_layers=self.pars["actor_layers"],
                output_shape=self.pars["act_size"],
                seed=self.pars["seed"])
        self.opt["act"] = torch.optim.Adam(self.net["act_loc"].parameters(),
                                           lr=self.pars["lr_act"])

        for twin in self.twins:
            for netname in ["crit_loc", "crit_tg"]:
                self.net[netname + twin] = CriticNetwork(
                    input_shape=self.pars["state_size"],
                    lin_layers=self.pars["crit_layers"],
                    output_shape=(1, ),
                    action_layer=self.pars["act_input"],
                    action_shape=self.pars["act_size"],
                    seed=self.pars["seed"])
            self.opt["crit" + twin] = torch.optim.Adam(
                self.net["crit_loc" + twin].parameters(),
                lr=self.pars["lr_crit"])

        if self.pars["erep_eps"] < 1.0:
            self.mem = ExperienceReplayer(
                self.pars["erep_size"],
                wait_fill=self.pars["erep_fill"],
                default_prio=self.pars["erep_def_prio"],
                epsilon=self.pars["erep_eps"])
        else:
            self.mem = ExperienceReplayer(self.pars["erep_size"],
                                          wait_fill=self.pars["erep_fill"])

        self.train_cr_count = 0
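Note: when pars["twin"] is set, Example #18 builds two critics (crit_loc_1/crit_tg_1 and crit_loc_2/crit_tg_2), which is the TD3 trick of bootstrapping from the smaller of the two target estimates to curb Q-value overestimation. The helper below is only an illustrative PyTorch sketch of how such a clipped double-Q target is usually formed; the reward/done tensors and gamma are assumptions, not part of the code above.

import torch


def twin_td_target(target_q1, target_q2, rewards, dones, gamma=0.99):
    # target_q1 / target_q2: outputs of the two target critics for the next
    # state and the target actor's action, shape (batch, 1); rewards and
    # dones are batch tensors of the same shape.
    min_q = torch.min(target_q1, target_q2)
    return rewards + gamma * (1.0 - dones) * min_q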
Example #19
    def __init__(self, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #20
    def __init__(self, environment):
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0
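Note: Example #20 keeps its transitions in a bare deque, whereas most other examples wrap one in a ReplayBuffer(REPLAY_BUFFER_SIZE) class whose definition is not shown. The class below is only a generic sketch of the usual interface; the real buffers referenced above differ per repository (some also take an action size, a seed or a directory).

import random
from collections import deque


class ReplayBuffer:
    """Minimal uniform-sampling replay buffer with a fixed capacity."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)  # oldest transitions are evicted automatically

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniformly sample a mini-batch of transitions
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)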
Example #21
    def __init__(self, sess, env, par_idx):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.par_idx = par_idx
        self.sess = sess

        with tf.variable_scope("particle_" + str(par_idx)):
            self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                              self.action_dim, self.par_idx)
            self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                                self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #22
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #23
    def __init__(self, env_name, sess, state_dim, action_dim, models_dir,
                 img_dim):
        self.name = 'DDPG'
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim
        self.models_dir = models_dir

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = sess

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.img_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.saver = tf.train.Saver()
Example #24
    def __init__(self, env, loadfilename=None, printVars=False):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # print('init complete')
        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                print(v.name.ljust(30), v.shape)
        
        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)
Example #25
    def __init__(self, state_dim, action_dim):
        """name for uploading resuults"""
        self.name = 'DDPG'
        self.time_step = 0
        # self.atten_rate = 1
        """Randomly initialize actor network and critic network"""
        """and both their target networks"""
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        """initialize replay buffer"""
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        """Initialize a random process the Ornstein-Uhlenbeck process for action exploration"""
        self.exploration_noise = OUNoise(self.action_dim)
        """Initialize a Treading"""
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')
Example #26
    def __init__(self, env):
        # ------------------- init the (NN) & (Buf) & (explor noise) & (counter) -------------------
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env[0]
        self.action_dim = env[1]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.epsilon_max = 1.0
        self.epsilon_min = 0.01
        self.counter = 0
Example #27
                       mask_inner=True,
                       mask_logits=True,
                       normalization=opts.normalization,
                       tanh_clipping=opts.tanh_clipping), opts.use_cuda)

    # Overwrite model parameters by parameters to load
    model.load_state_dict({**model.state_dict(), **load_data.get('model', {})})

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic':
        baseline = CriticBaseline(
            maybe_cuda_model(
                CriticNetwork(problem.NODE_DIM, opts.embedding_dim,
                              opts.hidden_dim, opts.n_encode_layers,
                              opts.normalization), opts.use_cuda))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
Example #28
def main():
    ''' Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)
    ''' Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()
    ''' Create placeholders 
    '''
    # Placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32, \
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32, \
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')

    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')
    ''' A counter to count the number of episodes
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)
    ''' Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM,
                             ACTION_DIM,
                             HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR,
                             HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)
    ''' Create the target actor network inside target_actor scope and calculate 
    the target actions. Apply stop_gradient to the target actions so that 
    their gradient is not computed at any point of time.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM,
                                    ACTION_DIM,
                                    HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR,
                                    HIDDEN_3_ACTOR,
                                    trainable=True)

        unscaled_target_actions = target_actor.call(next_state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)
    ''' Create the critic network inside the critic variable scope. Get the 
    Q-values of given actions and Q-values of actions suggested by the actor 
    network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM,
                               ACTION_DIM,
                               HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC,
                               HIDDEN_3_CRITIC,
                               trainable=True)

        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)
    ''' Create the target critic network inside the target_critic variable 
    scope. Calculate the target Q-values and apply stop_gradient to it.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM,
                                      ACTION_DIM,
                                      HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC,
                                      HIDDEN_3_CRITIC,
                                      trainable=True)

        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)
    ''' Calculate 
    - trainable variables in actor (Weights of actor network), 
    - Weights of target actor network
    - trainable variables in critic (Weights of critic network),
    - Weights of target critic network
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')

    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')

    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')

    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')
    ''' Get the operators for updating the target networks. The 
    update_target_networks function defined in utils returns a list of operators 
    to be run from a tf session in order to update the target networks using
    soft update.
    '''
    update_targets_op = update_target_networks(TAU, \
        target_actor_vars, actor_vars, target_critic_vars, \
            critic_vars)
    ''' Create the tf operation to train the critic network:
    - calculate TD-target 
    - calculate TD-Error = TD-target - q_values_of_given_actions
    - calculate Critic network's loss (Mean Squared Error of TD-Errors)
    - add L2 regularization on the critic weights (biases excluded)
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
            target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic weights (biases excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)
    ''' Create a tf operation to train the actor networks
    - Calculate the Actor network's loss
    - Create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(actor_loss,
                                                           var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):

            action = sess.run(actions, feed_dict={ \
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add Noise to actions
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)

            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]

            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and replay_memory.size() >= \
                MINI_BATCH_SIZE :
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder: np.asarray( \
                            [elem[0] for elem in batch]),
                        action_placeholder: np.asarray( \
                            [elem[1] for elem in batch]),
                        reward_placeholder: np.asarray( \
                            [elem[2] for elem in batch]),
                        next_state_placeholder: np.asarray( \
                            [elem[3] for elem in batch]),
                        is_not_terminal_placeholder: np.asarray( \
                            [elem[4] for elem in batch]),
                        is_training_placeholder: True
                })

                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
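Note: Example #28 only describes update_target_networks (defined in its utils module) as returning the ops that softly move the target variables towards the online ones. The function below is a sketch consistent with that description and with the call site; the real implementation may differ, and it assumes the variable lists are index-aligned (same architectures, same creation order).

import tensorflow as tf


def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    # Soft update: target <- tau * online + (1 - tau) * target
    ops = []
    for target_var, online_var in zip(target_actor_vars + target_critic_vars,
                                      actor_vars + critic_vars):
        ops.append(target_var.assign(tau * online_var + (1.0 - tau) * target_var))
    return tf.group(*ops)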
Example #29
                opts.hidden_dim,
                problem,
                n_encode_layers=opts.n_encode_layers,
                mask_inner=True,
                mask_logits=True,
                normalization=opts.normalization,
                tanh_clipping=opts.tanh_clipping), opts.use_cuda)

# Overwrite model parameters by parameters to load
model_ = get_inner_model(model)
model_.load_state_dict({**model_.state_dict(), **load_data.get('model', {})})

# Initialize baseline
baseline = CriticBaseline(
    maybe_cuda_model(
        CriticNetwork(2, opts.embedding_dim, opts.hidden_dim,
                      opts.n_encode_layers, opts.normalization),
        opts.use_cuda))
# Load baseline from data, make sure script is called with same type of baseline
if 'baseline' in load_data:
    baseline.load_state_dict(load_data['baseline'])

# Start the actual training loop
# val_dataset = problem.make_dataset(size=opts.graph_size, num_samples=opts.val_size, filename=opts.val_dataset)
# torch.save(val_dataset,'test_data/myval_20.pt')

# val_dataset = torch.load('test_data/myval_20.pt')

if opts.eval_only:
    total_cost, return_return, best = validate(model, val_dataset, opts)

    print('Improving: {} +- {}'.format(
Example #30
    def __init__(self, input_dims, n_actions, env,
                 fc1_dims, fc2_dims, alpha, beta,
                 gamma, tau, noise1, noise2, clamp,
                 delay, max_size, batch_size, warmup):

        self.gamma = gamma
        self.tau = tau
        self.noise1 = noise1
        self.noise2 = noise2
        self.clamp = clamp
        self.delay = delay
        self.batch_size = batch_size
        self.warmup = warmup
        self.learn_cntr = 0
        self.env = env
        self.n_actions = n_actions

        self.actor = ActorNetwork(
                     input_shape=input_dims,
                     n_actions=n_actions,
                     fc1_dims=fc1_dims,
                     fc2_dims=fc2_dims,
                     alpha=alpha,
                     name='Actor_TD3PG.cpt',
                     checkpoint_dir='tmp/models')

        self.critic_1 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_1_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.critic_2 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_2_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.target_actor = ActorNetwork(
                            input_shape=input_dims,
                            n_actions=n_actions,
                            fc1_dims=fc1_dims,
                            fc2_dims=fc2_dims,
                            alpha=alpha,
                            name='Target_Actor_TD3PG.cpt',
                            checkpoint_dir='tmp/models')

        self.target_critic_1 = CriticNetwork(
                               input_shape=input_dims,
                               n_actions=n_actions,
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims,
                               beta=beta,
                               name='Target_Critic_1_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.target_critic_2 = CriticNetwork(
                               input_shape=input_dims, 
                               n_actions=n_actions, 
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims, 
                               beta=beta, 
                               name='Target_Critic_2_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.memory = ReplayBuffer(
                      max_size=max_size, 
                      input_shape=input_dims, 
                      n_actions=n_actions)

        self.update_target_networks()
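Note: the noise2/clamp/delay hyperparameters stored by this constructor are the usual TD3 ingredients (target-policy smoothing and delayed actor updates), and the final update_target_networks() call typically hard-copies the online weights into the three target networks. The helper below sketches only the target-policy-smoothing step in PyTorch; the actor call convention and the scalar action bounds are assumptions, not taken from the code above.

import torch


def smoothed_target_actions(target_actor, next_states, noise_sigma, clamp_value,
                            action_low, action_high):
    # TD3 target-policy smoothing: add clipped Gaussian noise to the target
    # actor's action, then clip the result to the (scalar) action bounds.
    with torch.no_grad():
        actions = target_actor(next_states)
        noise = torch.clamp(torch.randn_like(actions) * noise_sigma,
                            -clamp_value, clamp_value)
        return torch.clamp(actions + noise, action_low, action_high)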