Example #1
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)
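
Several of these examples construct an OUNoise helper that is not shown. A minimal sketch of such an Ornstein-Uhlenbeck process, assuming the common mu/theta/sigma parameterization (the exact constructor signature varies across the projects above), could look like this:

import numpy as np

class OUNoise:
    """Temporally correlated exploration noise: x += theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)   # action_dim may be an int or a shape tuple
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def noise(self):
        self.state += self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        return self.state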
Example #2
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')
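
Example #2 depends on an update_target_graph(from_scope, to_scope) helper that is not included here. A sketch of the usual TF1 implementation (an assumption, not code from the project) is:

import tensorflow as tf

def update_target_graph(from_scope, to_scope):
    """Build ops that copy every trainable variable in from_scope into to_scope."""
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]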
Example #3
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
Example #4
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
Example #5
    def __init__(self,
                 input_dim,
                 action_dim,
                 critic_layers,
                 actor_layers,
                 actor_activation,
                 scope='ac_network'):

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.scope = scope

        self.x = tf.placeholder(shape=(None, input_dim),
                                dtype=tf.float32,
                                name='x')
        self.y = tf.placeholder(shape=(None, ), dtype=tf.float32, name='y')

        with tf.variable_scope(scope):
            self.actor_network = ActorNetwork(self.x,
                                              action_dim,
                                              hidden_layers=actor_layers,
                                              activation=actor_activation)

            self.critic_network = CriticNetwork(
                self.x,
                self.actor_network.get_output_layer(),
                hidden_layers=critic_layers)

            self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
            self._build()
Example #6
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.local_actor_network = ActorNetwork(state_size, action_size)
        self.target_actor_network = ActorNetwork(state_size, action_size)
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(state_size, action_size)
        self.target_critic_network = CriticNetwork(state_size, action_size)
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
Example #7
File: ddpg.py  Project: ivychill/ltr
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")
Example #8
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0
Example #9
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.actor = ActorNetwork(state_size, action_size)
        self.actor_target = ActorNetwork(state_size, action_size)
        self.soft_update(self.actor_target.parameters(),
                         self.actor.parameters(), 1)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=ACTOR_LEARNING_RATE)

        # Create one critic per agent
        self.critics = []
        self.critic_targets = []
        self.critic_optimizers = []
        for i in range(num_agents):

            # Critic
            # Note: we use action_size * num_agents since we'll pass in the actions of all agents concatenated
            critic = CriticNetwork(state_size * num_agents,
                                   action_size * num_agents)
            self.critics.append(critic)
            self.critic_targets.append(
                CriticNetwork(state_size * num_agents,
                              action_size * num_agents))
            self.soft_update(self.critic_targets[-1].parameters(),
                             critic.parameters(), 1)
            self.critic_optimizers.append(
                optim.Adam(critic.parameters(),
                           lr=CRITIC_LEARNING_RATE,
                           weight_decay=CRITIC_WEIGHT_DECAY))

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess((1, action_size),
                                                       sigma=RANDOM_SIGMA,
                                                       theta=RANDOM_THETA)
Example #10
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001  # Target Network HyperParameters
        LRA = 0.0001  # Learning rate for Actor
        LRC = 0.001  # Learning rate for Critic
        state_dim = 29  # of sensors input
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[],
                         total_step=[],
                         ave_reward=[],
                         distRaced=[],
                         distFromStart=[],
                         lastLapTime=[],
                         curLapTime=[],
                         lapTimes=[],
                         avelapTime=[],
                         ave_sp=[],
                         max_sp=[],
                         min_sp=[],
                         test_total_reward=[],
                         test_total_step=[],
                         test_ave_reward=[],
                         test_distRaced=[],
                         test_distFromStart=[],
                         test_lastLapTime=[],
                         test_curLapTime=[],
                         test_lapTimes=[],
                         test_avelapTime=[],
                         test_ave_sp=[],
                         test_max_sp=[],
                         test_min_sp=[])
Example #11
	def __init__(self, env):
		self.sess = tf.InteractiveSession()
		#self.params = loadparams() # ???
		self.env = env
		self.n_states = env.observation_space.shape[0]
		self.n_actions = env.action_space.shape[0]
		self.low = self.env.action_space.low
		self.high = self.env.action_space.high
		self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
		self.trainable_var_count = self.actor_network.get_trainable_var_count()
		self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \
			self.actor_network, self.trainable_var_count)
		self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']???
		self.exploration_noise = OUNoise(self.n_actions)
		# self.noise = Noise()
		self.gamma = GAMMA
		self.sess.run(tf.global_variables_initializer())
Example #12
    def create_multi_agents(self, sess, num_agents, state_dim, action_dim):
        agents = []
        nets = None
        for ii in range(num_agents):
            agent_name = 'agent' + str(ii)
            agents.append(
                ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
            nets = agents[-1].nets
        return agents
Example #13
    def __init__(self, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
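
Most of these constructors take a ReplayBuffer(REPLAY_BUFFER_SIZE) for granted. A minimal sketch compatible with the add / get_batch / count / erase calls used in these examples, assuming transitions are stored as tuples in a deque, might be:

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)

    def erase(self):
        self.buffer.clear()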
Example #14
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)
Example #15
    def __init__(self, environment):
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0
Example #16
File: ddpg.py  Project: ruizhao13/HFO
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #17
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return
Example #18
    def __init__(self, env_name, sess, state_dim, action_dim, models_dir,
                 img_dim):
        self.name = 'DDPG'
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim
        self.models_dir = models_dir

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = sess

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.img_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.saver = tf.train.Saver()
Example #19
    def __init__(self):
        self._init_setup()
        self.viewer = None
        self.action_space = spaces.Box(self.act_low, self.act_high)
        self.observation_space = spaces.Box(self.obs_low, self.obs_high)
        self._seed()
        self._reset()
        self.dt = 0.01
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess,
                                          self.observation_space.shape[0],
                                          self.action_space.shape[0])
        self.goal_state = np.zeros(shape=3)
Example #20
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())
Example #21
    def __init__(self, env):
        # ------------------- init the (NN) & (Buf) & (exploration noise) & (counter) -------------------
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env[0]
        self.action_dim = env[1]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.epsilon_max = 1.0
        self.epsilon_min = 0.01
        self.counter = 0
Example #22
    def __init__(self, env,loadfilename=None,printVars=False):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)

        #print 'init complete'
        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                print(v.name.ljust(30), v.shape)
        
        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)
Example #23
File: ddpg.py  Project: Ivehui/DDPG
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0
Example #24
    def add_agents(self, add_num):
        for ii in range(add_num):
            #self.num_agents+=1

            agent_name = 'agent' + str(self.num_agents)
            self.agents.append(
                ActorNetwork(self.sess, self.state_dim, self.action_dim,
                             agent_name, self.agents[-1].nets))
            # agent names run from 0 to num_agents - 1
            self.num_agents += 1

        # if a new agent is added, reset the noise and replay buffer
        self.exploration_noise = OUNoise((self.num_agents, self.action_dim))
        #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.replay_buffer.erase()
        # re-create a saver
        # the new saver will contain all the savable variables;
        # otherwise it would only contain the initially created agents
        self.saver = tf.train.Saver()
Example #25
    def main(self):
        np.random.seed(0)
        replay_memory = deque(maxlen=REPLAY_MEM_CAPACITY)

        def add_to_memory(experience):
            replay_memory.append(experience)

        def sample_from_memory(minibatch_size):
            return random.sample(replay_memory, minibatch_size)

        tf.reset_default_graph()

        # placeholders
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, STATE_DIM])
        action_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, ACTION_DIM])
        reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None])
        next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                                shape=[None, STATE_DIM])
        # indicators (go into target computation)
        is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32,
                                                     shape=[None])
        is_training_placeholder = tf.placeholder(dtype=tf.bool,
                                                 shape=())  # for dropout

        # episode counter
        episodes = tf.Variable(0.0, trainable=False, name='episodes')
        episode_incr_op = episodes.assign_add(1)

        # actor network
        with tf.variable_scope('actor'):
            actor = Actor(STATE_DIM,
                          ACTION_DIM,
                          HIDDEN_1_ACTOR,
                          HIDDEN_2_ACTOR,
                          HIDDEN_3_ACTOR,
                          trainable=True)
            '''              
            Policy's outputted action for each state_ph (for generating 
            actions and training the critic)
            '''
            actions_unscaled = actor.call(state_placeholder)
            actions = MIN_BANDWIDTH + tf.nn.sigmoid(actions_unscaled) * (
                MAX_BANDWIDTH - MIN_BANDWIDTH)

        # slow target actor network
        with tf.variable_scope('target_actor', reuse=False):
            target_actor = Actor(STATE_DIM,
                                 ACTION_DIM,
                                 HIDDEN_1_ACTOR,
                                 HIDDEN_2_ACTOR,
                                 HIDDEN_3_ACTOR,
                                 trainable=True)
            '''
            Slow target policy's outputted action for each next_state_ph 
            (for training the critic)
            use stop_gradient to treat the output values as constant targets 
            when doing backprop
            '''
            target_next_actions_unscaled = target_actor.call(
                next_state_placeholder)
            target_next_actions_1 = MIN_BANDWIDTH + tf.nn.sigmoid(\
                target_next_actions_unscaled)*(MAX_BANDWIDTH - MIN_BANDWIDTH)
            target_next_actions = tf.stop_gradient(target_next_actions_1)

        with tf.variable_scope('critic') as scope:
            critic = Critic(STATE_DIM,
                            ACTION_DIM,
                            HIDDEN_1_CRITIC,
                            HIDDEN_2_CRITIC,
                            HIDDEN_3_CRITIC,
                            trainable=True)
            # Critic applied to state_ph and a given action(for training critic)
            q_values_of_given_actions = critic.call(state_placeholder,
                                                    action_placeholder)
            '''
            Critic applied to state_ph and the current policy's outputted 
            actions for state_ph (for training actor via deterministic 
            policy gradient)
            '''
            q_values_of_suggested_actions = critic.call(
                state_placeholder, actions)

        # slow target critic network
        with tf.variable_scope('target_critic', reuse=False):
            target_critic = Critic(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                                   HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, \
                                       trainable=True)
            '''
            Slow target critic applied to slow target actor's outputted 
            actions for next_state_ph (for training critic)
            '''
            q_values_next = tf.stop_gradient(
                target_critic.call(next_state_placeholder,
                                   target_next_actions))

        # isolate vars for each network
        actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope='actor')
        target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope='target_actor')
        critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope='critic')
        target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='target_critic')

        # update slowly-changing targets towards current actor and critic
        update_target_ops = []
        for i, target_actor_var in enumerate(target_actor_vars):
            update_target_actor_op = target_actor_var.assign(
                TAU * actor_vars[i] + (1 - TAU) * target_actor_var)
            update_target_ops.append(update_target_actor_op)

        for i, target_var in enumerate(target_critic_vars):
            target_critic_op = target_var.assign(TAU * critic_vars[i] +
                                                 (1 - TAU) * target_var)
            update_target_ops.append(target_critic_op)

        update_targets_op = tf.group(*update_target_ops,
                                     name='update_slow_targets')
        '''
        # One step TD targets y_i for (s,a) from experience replay
        # = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
        # = r_i if s' terminal
        '''
        targets = tf.expand_dims(
            reward_placeholder, 1) + tf.expand_dims(is_not_terminal_placeholder,\
                 1) * GAMMA * q_values_next

        # 1-step temporal difference errors
        td_errors = targets - q_values_of_given_actions

        # critic loss function (mean-square value error with regularization)
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        for var in critic_vars:
            if not 'bias' in var.name:
                critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

        # critic optimizer
        critic_train_op = tf.train.AdamOptimizer(
            LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

        # actor loss function (mean Q-values under current policy with
        # regularization)
        actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
        for var in actor_vars:
            if not 'bias' in var.name:
                actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)
        '''
        actor optimizer
        the gradient of the mean Q-values wrt actor params is the 
        deterministic policy gradient (keeping critic params fixed)
        '''

        actor_train_op = tf.train.AdamOptimizer(
            LEARNING_RATE_ACTOR*LR_DECAY**episodes).minimize(actor_loss, \
                var_list=actor_vars)

        # initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # print(sess.run(tf.report_uninitialized_variables()))

        ## Training

        num_steps = 0
        for episode in range(NUM_EPISODES):
            total_reward = 0
            num_steps_in_episode = 0

            # Create noise
            noise = np.zeros(ACTION_DIM)
            noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
                (MAX_BANDWIDTH - MIN_BANDWIDTH)  # TODO: uses env

            # Initial state
            self.reset()  # TODO: uses env
            state = self.input_state

            for t in range(MAX_STEPS_PER_EPISODE):
                # choose action based on deterministic policy
                state = np.asarray(state)
                state = state.reshape(1, state.shape[0])
                action, = sess.run(actions,
                                   feed_dict={state_placeholder: state, \
                                       is_training_placeholder: False})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise = EXPLORATION_THETA * \
                    (EXPLORATION_MU - noise) + \
                    EXPLORATION_SIGMA*np.random.randn(ACTION_DIM)
                action += noise_scale * noise

                # take step
                next_state, reward, done, = self.step(action)
                total_reward += reward

                add_to_memory((
                    state,
                    action,
                    reward,
                    next_state,
                    #    is next_observation a terminal state?
                    #    0.0 if done and not env.env._past_limit() else
                    #    1.0))
                    0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if num_steps % TRAIN_EVERY == 0 and \
                    len(replay_memory) >= MINI_BATCH_SIZE:

                    minibatch = sample_from_memory(MINI_BATCH_SIZE)
                    '''
                    update the critic and actor params using mean-square value 
                    error and deterministic policy gradient, respectively
                    '''
                    _, _ = sess.run([critic_train_op, actor_train_op],
                                    feed_dict={
                        state_placeholder: np.asarray([elem[0] for elem in \
                            minibatch]),
                        action_placeholder: np.asarray([elem[1] for elem in \
                            minibatch]),
                        reward_placeholder: np.asarray([elem[2] for elem in \
                            minibatch]),
                        next_state_placeholder: np.asarray([elem[3] for elem in\
                             minibatch]),
                        is_not_terminal_placeholder: np.asarray([elem[4] for \
                            elem in minibatch]),

                        is_training_placeholder: True})
                    '''
                    update slow actor and critic targets towards current actor 
                    and critic
                    '''
                    _ = sess.run(update_targets_op)

                state = next_state
                num_steps += 1
                num_steps_in_episode += 1

                if done:
                    # Increment episode counter
                    _ = sess.run(episode_incr_op)
                    break

            print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'
                  % (episode, total_reward, num_steps_in_episode, noise_scale))
Example #26
class DDPGAgent():
    """
    Deep deterministic policy gradient agent as described in
    https://arxiv.org/abs/1509.02971.

    This agent is meant to operate on low dimensional inputs, not raw pixels.

    To use the agent, get action predictions from act(); to teach the agent,
    feed the resulting experiences to learn().
    """
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.local_actor_network = ActorNetwork(state_size, action_size)
        self.target_actor_network = ActorNetwork(state_size, action_size)
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(state_size, action_size)
        self.target_critic_network = CriticNetwork(state_size, action_size)
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)

    def act(self, states, noise=True):
        """
        Returns an action vector based on the current game state.

        Params
        ======
        states (array_like): A matrix of game states (each row represents the
            state of an agent)
        noise (boolean): Add random noise to the predicted action.  Aids
            exploration of the environment during training.
        """

        self.local_actor_network.eval()
        with torch.no_grad():
            actions = self.local_actor_network(
                torch.tensor(states, dtype=torch.float32)).detach().numpy()
        self.local_actor_network.train()
        if noise:
            actions = actions + self.random_process.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def vectorize_experiences(self, experiences):
        """Vectorizes experience objects for use by pytorch

        Params
        ======
            experiences (array_like of Experience objects): Experiences to
                vectorize
        """
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(self.device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).float().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None
                       ]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def normalize(self, to_normalize):
        """
        Normalize each row of the input along dimension 0 using the
        formula (value - mean) / std

        Params
        ======
        to_normalize (array_like): Values to normalize
        """

        std = to_normalize.std(0)
        mean = to_normalize.mean(0)
        return (to_normalize - mean) / (std + 1e-5)

    def soft_update(self, target_parameters, local_parameters):
        """
        Updates the given target network parameters with the local parameters
        using a soft update strategy: tau * local + (1-tau) * target
        """

        for target, local in zip(target_parameters, local_parameters):
            target.data.copy_(TAU * local.data + (1.0 - TAU) * target.data)

    def train(self, experiences):
        """
        Trains the actor and critic networks using a minibatch of experiences

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """
        states, actions, rewards, next_states, dones = self.vectorize_experiences(
            experiences)
        #states = self.normalize(states)
        #next_states = self.normalize(next_states)
        rewards = self.normalize(rewards)

        # Use the target critic network to calculate a target q value
        next_actions = self.target_actor_network(next_states)
        q_target = rewards + GAMMA * self.target_critic_network(
            next_states, next_actions) * (1 - dones)

        # Calculate the predicted q value
        q_predicted = self.local_critic_network(states, actions)

        # Update critic network
        critic_loss = F.mse_loss(q_predicted, q_target)
        #print(critic_loss)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.local_critic_network.parameters(),
                                       1)
        self.critic_optimizer.step()

        # Update predicted action using policy gradient
        actions_predicted = self.local_actor_network(states)
        #print(self.local_critic_network(states, actions_predicted).mean())
        policy_loss = -self.local_critic_network(states,
                                                 actions_predicted).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        #print(policy_loss)
        self.actor_optimizer.step()

        self.soft_update(self.target_actor_network.parameters(),
                         self.local_actor_network.parameters())
        self.soft_update(self.target_critic_network.parameters(),
                         self.local_critic_network.parameters())

    def learn(self, experience):
        """
        Tells the agent to learn from an experience.  This may not immediately
        result in training since this agent uses a replay buffer.

        Params
        ======
        experience (Experience): An experience used to teach the agent.
        """
        self.replay_buffer.add(experience)
        self.steps += 1
        if self.steps % STEPS_BETWEEN_TRAINING == 0 and len(
                self.replay_buffer) >= BATCH_SIZE:
            for i in range(ITERATIONS_PER_TRAINING):
                self.train(self.replay_buffer.sample(BATCH_SIZE))

    def save(self, filename):
        """Saves learned params of underlying networks to a checkpoint file.

        Params
        ======
            filename (string): Target file.  actor- and critic- are prepended
                for the actor and critic network, respectively
        """
        torch.save(self.local_actor_network.state_dict(), "actor-" + filename)
        torch.save(self.local_critic_network.state_dict(),
                   "critic-" + filename)

    def load(self, filename):
        """Loads learned params generated by save() into underlying networks.

            filename (string): Path to file.  There should be an actor- and
            critic- version of this file.
        """
        self.local_actor_network.load_state_dict(
            torch.load("actor-" + filename))
        self.target_actor_network.load_state_dict(
            torch.load("actor-" + filename))

        self.local_critic_network.load_state_dict(
            torch.load("critic-" + filename))
        self.target_critic_network.load_state_dict(
            torch.load("critic-" + filename))

    def end_episode(self):
        """
        Tell the agent that an episode is complete.
        """
        self.random_process.reset()
        self.steps = 0
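
A short usage sketch for this agent, assuming a hypothetical Experience record with the fields vectorize_experiences() expects, dummy arrays in place of a real multi-agent environment, and the module-level constants (learning rates, buffer size, noise parameters, etc.) already defined:

from collections import namedtuple

import numpy as np

# Hypothetical record matching the fields vectorize_experiences() reads.
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

agent = DDPGAgent(state_size=24, action_size=2, num_agents=2)

states = np.zeros((2, 24))                 # stand-in for env.reset()
for step in range(1000):
    actions = agent.act(states)            # (num_agents, action_size), clipped to [-1, 1]
    # stand-ins for env.step(actions):
    next_states = np.zeros((2, 24))
    rewards = np.zeros(2)
    dones = np.zeros(2)
    for s, a, r, ns, d in zip(states, actions, rewards, next_states, dones):
        agent.learn(Experience(s, a, r, ns, d))   # buffered; trains every STEPS_BETWEEN_TRAINING steps
    states = next_states
agent.end_episode()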
Example #27
class Agent:
    def __init__(self, input_dims, n_actions, env,
                 fc1_dims, fc2_dims, alpha, beta,
                 gamma, tau, noise1, noise2, clamp,
                 delay, max_size, batch_size, warmup):

        self.gamma = gamma
        self.tau = tau
        self.noise1 = noise1
        self.noise2 = noise2
        self.clamp = clamp
        self.delay = delay
        self.batch_size = batch_size
        self.warmup = warmup
        self.learn_cntr = 0
        self.env = env
        self.n_actions = n_actions

        self.actor = ActorNetwork(
                     input_shape=input_dims,
                     n_actions=n_actions,
                     fc1_dims=fc1_dims,
                     fc2_dims=fc2_dims,
                     alpha=alpha,
                     name='Actor_TD3PG.cpt',
                     checkpoint_dir='tmp/models')

        self.critic_1 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_1_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.critic_2 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_2_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.target_actor = ActorNetwork(
                            input_shape=input_dims,
                            n_actions=n_actions,
                            fc1_dims=fc1_dims,
                            fc2_dims=fc2_dims,
                            alpha=alpha,
                            name='Target_Actor_TD3PG.cpt',
                            checkpoint_dir='tmp/models')

        self.target_critic_1 = CriticNetwork(
                               input_shape=input_dims,
                               n_actions=n_actions,
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims,
                               beta=beta,
                               name='Target_Critic_1_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.target_critic_2 = CriticNetwork(
                               input_shape=input_dims, 
                               n_actions=n_actions, 
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims, 
                               beta=beta, 
                               name='Target_Critic_2_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.memory = ReplayBuffer(
                      max_size=max_size, 
                      input_shape=input_dims, 
                      n_actions=n_actions)

        self.update_target_networks()

    def update_target_networks(self):
        tau = self.tau

        actor = dict(self.actor.named_parameters())
        critic_1 = dict(self.critic_1.named_parameters())
        critic_2 = dict(self.critic_2.named_parameters())
        target_actor = dict(self.target_actor.named_parameters())
        target_critic_1 = dict(self.target_critic_1.named_parameters())
        target_critic_2 = dict(self.target_critic_2.named_parameters())
        
        for name in actor:
            actor[name] = tau*actor[name].clone() + (1-tau)*target_actor[name].clone()
        
        for name in critic_1:
            critic_1[name] = tau*critic_1[name].clone() + (1-tau)*target_critic_1[name].clone()
        
        for name in critic_2:
            critic_2[name] = tau*critic_2[name].clone() + (1-tau)*target_critic_2[name].clone()
        
        self.target_actor.load_state_dict(actor)
        self.target_critic_1.load_state_dict(critic_1)
        self.target_critic_2.load_state_dict(critic_2)
    
    def choose_action(self, observation):
        if self.learn_cntr < self.warmup:
            mu = np.random.normal(scale=self.noise1, 
                                  size=self.n_actions)
            mu = T.tensor(mu).to(self.actor.device)
        else:
            state = T.tensor(observation,
                             dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state)
        noise = T.tensor(np.random.normal(scale=self.noise1,
                                          size=self.n_actions), 
                         dtype=T.float).to(self.actor.device)
        mu_ = T.clamp(T.add(mu, noise), min=self.env.action_space.low[0],
                                        max=self.env.action_space.high[0])
        self.learn_cntr += 1
        return mu_.cpu().detach().numpy()
    
    def save_models(self):
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
        
    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)
        
    def sample(self):
        states, actions, rewards, states_, done = \
                                self.memory.sample_buffer(self.batch_size)
        
        states = T.tensor(states, dtype=T.float).to(self.critic_1.device)
        actions = T.tensor(actions, dtype=T.float).to(self.critic_1.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.critic_1.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done, dtype=T.int).to(self.critic_1.device)
        
        return states, actions, rewards, states_, done
        
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        
        states, actions, rewards, states_, done = self.sample()

        Vs1 = self.critic_1.forward(states, actions)
        Vs2 = self.critic_2.forward(states, actions)

        actions_ = self.target_actor.forward(states_)

        noise = T.tensor(np.random.normal(scale=self.noise1,
                                          size=self.n_actions), 
                         dtype=T.float).to(self.actor.device)
        noise = T.clamp(noise, min=-self.clamp, max=self.clamp)
        
        actions_ = T.add(actions_, noise)
        actions_ = T.clamp(actions_, min=self.env.action_space.low[0], 
                                     max=self.env.action_space.high[0])

        critic_1_Vs_ = self.target_critic_1.forward(states_, actions_)
        critic_2_Vs_ = self.target_critic_2.forward(states_, actions_)
        min_Vs_ = T.min(critic_1_Vs_, critic_2_Vs_)

        target = rewards + self.gamma*min_Vs_*(1-done)

        self.critic_1.optim.zero_grad()
        self.critic_2.optim.zero_grad()
        critic_1_loss = F.mse_loss(Vs1, target)
        critic_2_loss = F.mse_loss(Vs2, target)
        critic_loss = T.add(critic_1_loss, critic_2_loss)
        critic_loss.backward()
        self.critic_1.optim.step()
        self.critic_2.optim.step()

        if self.learn_cntr % self.delay == 0:
            self.actor.optim.zero_grad()
            actor_loss = self.critic_1.forward(states_, self.actor.forward(states_))
            actor_loss = -T.mean(actor_loss)
            actor_loss.backward()
            self.actor.optim.step()
            
            self.update_target_networks()
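
The delayed update above calls update_target_networks(), which is not part of this snippet. A minimal sketch of the usual Polyak (soft) update is below; it assumes the actor and critic classes are torch.nn.Module subclasses and that the agent keeps the tau passed to its constructor.

import torch as T

def soft_update(target_net, online_net, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    for t_param, param in zip(target_net.parameters(), online_net.parameters()):
        t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)

# inside the agent, update_target_networks() could then reduce to (a sketch):
#     soft_update(self.target_actor, self.actor, self.tau)
#     soft_update(self.target_critic_1, self.critic_1, self.tau)
#     soft_update(self.target_critic_2, self.critic_2, self.tau)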
Example #28
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
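
A minimal sketch of how this class is typically driven, assuming a gym environment and that ENV_NAME is defined alongside the other module constants; training happens inside perceive() once the buffer exceeds REPLAY_START_SIZE, so the loop itself only collects experience.

import gym

env = gym.make(ENV_NAME)                       # ENV_NAME assumed, like the other constants
with open('results.txt', 'w') as results_file:
    agent = DDPG(env, results_file)
    for episode in range(1000):                # episode count is illustrative
        state = env.reset()
        done = False
        while not done:
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state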
Example #29
0
    def __init__(self, input_dims, n_actions, env,
                 fc1_dims, fc2_dims, alpha, beta,
                 gamma, tau, noise1, noise2, clamp,
                 delay, max_size, batch_size, warmup):

        self.gamma = gamma
        self.tau = tau
        self.noise1 = noise1
        self.noise2 = noise2
        self.clamp = clamp
        self.delay = delay
        self.batch_size = batch_size
        self.warmup = warmup
        self.learn_cntr = 0
        self.env = env
        self.n_actions = n_actions

        self.actor = ActorNetwork(
                     input_shape=input_dims,
                     n_actions=n_actions,
                     fc1_dims=fc1_dims,
                     fc2_dims=fc2_dims,
                     alpha=alpha,
                     name='Actor_TD3PG.cpt',
                     checkpoint_dir='tmp/models')

        self.critic_1 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_1_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.critic_2 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_2_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.target_actor = ActorNetwork(
                            input_shape=input_dims,
                            n_actions=n_actions,
                            fc1_dims=fc1_dims,
                            fc2_dims=fc2_dims,
                            alpha=alpha,
                            name='Target_Actor_TD3PG.cpt',
                            checkpoint_dir='tmp/models')

        self.target_critic_1 = CriticNetwork(
                               input_shape=input_dims,
                               n_actions=n_actions,
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims,
                               beta=beta,
                               name='Target_Critic_1_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.target_critic_2 = CriticNetwork(
                               input_shape=input_dims, 
                               n_actions=n_actions, 
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims, 
                               beta=beta, 
                               name='Target_Critic_2_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.memory = ReplayBuffer(
                      max_size=max_size, 
                      input_shape=input_dims, 
                      n_actions=n_actions)

        self.update_target_networks()
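
A usage sketch for the agent whose constructor is shown above. The class name Agent, the action-selection method name choose_action, the environment id and the hyperparameter values are all assumptions; the buffer-filling and learning behaviour follows the sample() and learn() methods shown earlier.

import gym

env = gym.make('LunarLanderContinuous-v2')      # any continuous-action env
agent = Agent(input_dims=env.observation_space.shape,
              n_actions=env.action_space.shape[0], env=env,
              fc1_dims=400, fc2_dims=300, alpha=1e-3, beta=1e-3,
              gamma=0.99, tau=0.005, noise1=0.1, noise2=0.2, clamp=0.5,
              delay=2, max_size=1_000_000, batch_size=100, warmup=1000)

for episode in range(1000):
    observation = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)    # exploration noise added inside
        observation_, reward, done, _ = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()                                # no-op until batch_size transitions exist
        observation = observation_
agent.save_models()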
Example #30
0
class RDPG:
    """docstring for RDPG"""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observation and action
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Start training once the buffer holds more than REPLAY_START_SIZE histories
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process at the end of the episode;
        # each stored history covers a complete episode
        self.exploration_noise.reset()

    def pad(self, input):
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
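
train() above iterates replay entries that expose .observations, .actions and .rewards for a whole episode, and perceive() stores one such history at a time. A minimal container matching that usage is sketched below; the History name and the collection loop are assumptions, not part of the original project.

from collections import namedtuple

# one entry per episode, as consumed by train() above
History = namedtuple('History', ['observations', 'actions', 'rewards'])

# collecting an episode before handing it to the agent (sketch):
#     observations, actions, rewards = [env.reset()], [], []
#     done = False
#     while not done:
#         action = agent.noise_action(observations)
#         obs, reward, done, _ = env.step(action)
#         observations.append(obs); actions.append(action); rewards.append(reward)
#     agent.perceive(History(observations, actions, rewards))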
Example #31
0
File: ddpg.py  Project: Ivehui/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self,observation):
        # receive initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        action_batch = np.resize(action_batch,[BATCH_SIZE,1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch)
        for i in range(0,BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high)

    def set_feedback(self,observation,action,reward,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append((self.state,action,reward,next_state,done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Start training once more than REPLAY_START_SIZE transitions have been collected
        if self.time_step >  REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #32
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, environment):
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self, observation):
        # receive initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        action_batch = np.resize(action_batch, [BATCH_SIZE, 1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(
            next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(
            next_state_batch, next_action_batch)
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients) / BATCH_SIZE

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.exploration_noise.noise(),
                       self.environment.action_space.low,
                       self.environment.action_space.high)

    def set_feedback(self, observation, action, reward, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append(
            (self.state, action, reward, next_state, done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Start training once more than REPLAY_START_SIZE transitions have been collected
        if self.time_step > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #33
0
def main():
    ''' Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)
    ''' Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()
    ''' Create placeholders 
    '''
    # Placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32, \
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32, \
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')

    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')
    ''' A counter to count the number of episodes
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)
    ''' Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM,
                             ACTION_DIM,
                             HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR,
                             HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)
    ''' Create the target actor network inside target_actor scope and calculate 
    the target actions. Apply stop_gradient to the target actions so that 
    their gradient is not computed at any point.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM,
                                    ACTION_DIM,
                                    HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR,
                                    HIDDEN_3_ACTOR,
                                    trainable=True)

        unscaled_target_actions = target_actor.call(next_state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)
    ''' Create the critic network inside the critic variable scope. Get the 
    Q-values of given actions and Q-values of actions suggested by the actor 
    network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM,
                               ACTION_DIM,
                               HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC,
                               HIDDEN_3_CRITIC,
                               trainable=True)

        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)
    ''' Create the target critic network inside the target_critic variable 
    scope. Calculate the target Q-values and apply stop_gradient to it.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM,
                                      ACTION_DIM,
                                      HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC,
                                      HIDDEN_3_CRITIC,
                                      trainable=True)

        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)
    ''' Calculate 
    - trainable variables in actor (Weights of actor network), 
    - Weights of target actor network
    - trainable variables in critic (Weights of critic network),
    - Weights of target critic network
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')

    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')

    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')

    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')
    ''' Get the operators for updating the target networks. The 
    update_target_networks function defined in utils returns a list of operators 
    to be run in a tf session in order to update the target networks via a
    soft update (a minimal sketch of such a helper follows main() below).
    '''
    update_targets_op = update_target_networks(TAU, \
        target_actor_vars, actor_vars, target_critic_vars, \
            critic_vars)
    ''' Create the tf operation to train the critic network:
    - calculate TD-target 
    - calculate TD-Error = TD-target - q_values_of_given_actions
    - calculate Critic network's loss (Mean Squared Error of TD-Errors)
    - add L2 weight regularization on the critic's non-bias weights
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
            target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic's non-bias weights
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)
    ''' Create a tf operation to train the actor networks
    - Calculate the Actor network's loss
    - Create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(actor_loss,
                                                           var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):

            action = sess.run(actions, feed_dict={ \
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Ornstein-Uhlenbeck exploration noise, added to the action below
            noise = noise + EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)

            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]

            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and replay_memory.size() >= \
                MINI_BATCH_SIZE :
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder: np.asarray( \
                            [elem[0] for elem in batch]),
                        action_placeholder: np.asarray( \
                            [elem[1] for elem in batch]),
                        reward_placeholder: np.asarray( \
                            [elem[2] for elem in batch]),
                        next_state_placeholder: np.asarray( \
                            [elem[3] for elem in batch]),
                        is_not_terminal_placeholder: np.asarray( \
                            [elem[4] for elem in batch]),
                        is_training_placeholder: True
                })

                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
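
main() relies on scale_actions() and update_target_networks() from the project's utils module, which are not reproduced here. Plausible minimal versions consistent with how they are called are sketched below; they assume the actor's output layer is tanh-bounded and that the online and target variable collections are created in the same order.

import tensorflow as tf

def scale_actions(actions, low, high):
    # map tanh outputs in [-1, 1] onto the environment's [low, high] box
    return low + (actions + 1.0) * 0.5 * (high - low)

def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    # soft (Polyak) updates: target <- tau * online + (1 - tau) * target
    ops = []
    for t_var, var in zip(target_actor_vars + target_critic_vars,
                          actor_vars + critic_vars):
        ops.append(t_var.assign(tau * var + (1.0 - tau) * t_var))
    return tf.group(*ops)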
Example #34
0
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim == 1, each sampled action is a scalar rather than an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
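
Nearly every agent in these examples draws exploration noise from an OUNoise class that is never shown. A common minimal implementation of the Ornstein-Uhlenbeck process is sketched below; the default mu, theta and sigma values are assumptions, not the projects' exact settings.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch of the class used in the examples above)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its long-run mean at the start of each episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state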