def __init__(self, state_size, action_size, random_seed, hyperparams):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.hyperparams = hyperparams

    self.actor = Actor(state_size, action_size, random_seed).to(device)
    self.actor_noise = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=hyperparams.alpha_actor)

    self.critic = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optim = optim.Adam(
        self.critic.parameters(),
        lr=hyperparams.alpha_critic,
        weight_decay=hyperparams.weight_decay,
    )

    self.replay_buffer = ReplayBuffer(hyperparams.buffer_size, hyperparams.batch_size, random_seed)
    self.noise = OUNoise(
        action_size,
        random_seed,
        self.hyperparams.mu,
        self.hyperparams.theta,
        self.hyperparams.sigma,
    )
def __init__(self, state_size, action_size, seed):
    self.gradient_clipping = True
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.config = Config()

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.config.LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(
        self.critic_local.parameters(),
        lr=self.config.LR_CRITIC,
        weight_decay=self.config.WEIGHT_DECAY)

    self.noise = OUNoise(action_size, seed)
    self.memory = ReplayBuffer(action_size, self.config.BUFFER_SIZE, self.config.BATCH_SIZE, seed, device)
    self.step_count = 0
def __init__(self, state_size, action_size, agent_id):
    """Initialize a DDPGAgent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        agent_id (int): identifier for this agent
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(RANDOM_SEED)
    self.agent_id = agent_id

    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    self.critic_local = Critic(state_size, action_size).to(device)
    self.critic_target = Critic(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Make sure that the target-local model pairs are initialized to the
    # same weights
    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    self.noise = OUNoise(action_size)
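# Note: hard_update (and the soft, tau-weighted update used elsewhere in these
# agents) is referenced above but never shown. A minimal sketch of what these
# helpers typically look like in the PyTorch agents above -- an assumption,
# not code taken from any one of the listed repos:
def hard_update(self, local_model, target_model):
    """Copy the local network weights into the target network (tau = 1)."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(local_param.data)

def soft_update(self, local_model, target_model, tau):
    """Blend local weights into the target: theta_target = tau*theta_local + (1-tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)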
def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau,
             batch_size, n_train, n_episode):
    # Gym environment
    self.env = env

    env_flattened = gym.wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

    # Get space sizes
    self.state_dim = env_flattened.observation_space.shape[0]
    # self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]

    # Replay buffer and a function to sample a batch from it
    self.replay_buffer = replay_buffer
    self.sample_batch = sample_batch

    self.sess = tf.InteractiveSession()

    # Hyper parameters
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_train = n_train
    self.n_episode = n_episode

    # Initialize networks
    self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env, state_dim=None):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # along with their target networks
    if state_dim:
        self.state_dim = state_dim
        print(self.state_dim)
    else:
        self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    # Flag to signal save
    self.not_saved = True

    # For normalisation
    self.state_mean = 0
    self.state_std = 1
    self.target_mean = 0
    self.target_std = 1
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    # self.state_dim = env.observation_space.shape[0]
    # self.action_dim = env.action_space.shape[0]
    self.state_dim = env.state_size
    self.action_dim = env.action_size
    self.action_bound = (env.action_high - env.action_low) / 2
    print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
          'action_bound: ', self.action_bound)

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.action_bound)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.action_bound)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau,
             actor_lr, critic_lr, gamma, buffer_size, item_space, summary_dir):
    self.state_item_num = state_item_num
    self.action_item_num = action_item_num
    self.emb_dim = emb_dim
    self.batch_size = batch_size
    self.tau = tau
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.item_space = item_space
    self.summary_dir = summary_dir

    self.sess = tf.Session()

    self.s_dim = emb_dim * state_item_num
    self.a_dim = emb_dim * action_item_num
    self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
    self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                         self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
    self.exploration_noise = OUNoise(self.a_dim)

    # set up summary operators
    self.summary_ops, self.summary_vars = self.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

    # initialize target network weights
    self.actor.hard_update_target_network()
    self.critic.hard_update_target_network()

    # initialize replay memory
    self.replay_buffer = ReplayBuffer(buffer_size)
def __init__(self, env, time_steps, hidden_dim):
    self.name = 'DDPG'  # name for uploading results
    self.scale = env.asset
    self.unit = env.unit
    self.seed = env.rd_seed

    self.time_dim = time_steps
    self.state_dim = env.observation_space.shape[1]
    self.action_dim = env.action_space.shape[0]
    self.batch_size = 64
    self.memory_size = self.time_dim + self.batch_size * 10
    self.start_size = self.time_dim + self.batch_size * 2

    # Initialise actor & critic networks
    self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
    self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim)

    # Initialize replay buffer
    self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
    self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
    self.replay_action = torch.zeros((self.start_size - 1, 1, self.state_dim), device=cuda)
    self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)
    self.initial()
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
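# The OUNoise process itself is never defined in these snippets. Below is a
# minimal sketch consistent with the (size, mu, theta, sigma) constructor used
# in the previous example -- an assumption, not the exact class from that repo.
# Other examples here call the sampling method noise() instead of sample().
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Drift toward mu plus Gaussian diffusion; return the updated state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state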
def __init__(self, sess, number, model_path, global_episodes, explore, decay, training):
    self.name = 'worker_' + str(number)  # name for uploading results
    self.number = number
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = 41
    self.action_dim = 18
    self.model_path = model_path
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.sess = sess
    self.explore = explore
    self.decay = decay
    self.training = training

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.name + '/actor')
    self.actor_network.update_target(self.sess)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.name + '/critic')
    self.critic_network.update_target(self.sess)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.update_local_ops_actor = update_target_graph('global/actor', self.name + '/actor')
    self.update_local_ops_critic = update_target_graph('global/critic', self.name + '/critic')
def __init__(self, env, DIRECTORY):
    self.batch_size = BATCH_SIZE
    self.replay_start_size = REPLAY_START_SIZE
    # self.sub_batch_size = BATCH_SIZE / n_gpu

    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    self.trace_length = TRACE_LENGTH
    self.temp_abstract = TEMP_ABSTRACT
    self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                      self.action_dim, self.temp_abstract, DIRECTORY)
    self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                        self.action_dim, self.temp_abstract, DIRECTORY)

    # initialize replay buffer
    max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY, max_len_trajectory,
                                      self.actor_network.last_epi)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    ###
    self.diff = 0.
    self.discounting_mat_dict = {}
def __init__(self, shape_in, num_output, accele_range, angle_range):
    self.input_shape = shape_in
    self.out_shape = num_output
    self.learning_rate_a = LEARNING_RATE_ACTOR
    self.learning_rate_c = LEARNING_RATE_CRITIC
    self.memory = deque(maxlen=MAX_MEMORY_LEN)
    self.train_start = 200
    self.batch_size = 64
    self.gamma = 0.9
    self.sigma_fixed = 2
    self.channel = CHANNEL
    self.critic_input_action_shape = 1
    self.angle_range = angle_range
    self.accele_range = accele_range

    self.actor_model = self.actor_net_builder()
    self.critic_model = self.critic_net_build()
    self.actor_target_model = self.actor_net_builder()
    self.critic_target_model = self.critic_net_build()
    self.OUnoise = OUNoise(2)
    # self.actor_target_model.trainable = False
    # self.critic_target_model.trainable = False

    self.actor_history = []
    self.critic_history = []
    self.reward_history = []
    self.weight_hard_update()
def __init__(self, env):
    self.action_dim = env.action_space.shape[0]
    self.state_dim = env.observation_space.shape[0]

    self.h1_dim = 400
    self.h2_dim = 300
    self.actor_learning_rate = 1e-4
    self.critic_learning_rate = 1e-3
    self.gamma = 0.99

    # Ornstein-Uhlenbeck noise parameters
    self.noise_theta = 0.15
    self.noise_sigma = 0.20
    self.ou = OUNoise(self.action_dim, theta=self.noise_theta, sigma=self.noise_sigma)

    self.replay_buffer_size = 1000000
    self.replay_buffer = deque(maxlen=self.replay_buffer_size)
    self.replay_start_size = 1000
    self.batch_size = 64
    self.target_update_rate = 0.001
    self.total_parameters = 0
    self.global_steps = 0
    self.reg_param = 0.01
def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # actor
    self.actor = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

    # critic
    self.critic = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4, weight_decay=0.0001)

    # will add noise
    self.noise = OUNoise(action_size, seed)

    # experience replay
    self.replay = ReplayBuffer(seed)
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    # self.state_dim = env.observation_space.shape[0] * 2
    self.action_dim = env.action_space.shape[0]
    self.time_step = 0

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    self.exploration_noise = OUNoise()

    # loading networks
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path))
    else:
        my_config.logger.error("Could not find old network weights")
def reset(self):
    # Called at the start of every episode.
    # Re-initialize the working variables.
    # To move the robot back to the origin each episode, uncomment the lines below.
    self.robot_tf = Transformation()
    self.joint1_tf = Transformation()
    self.link1_tf = Transformation(translation=(self.link1_len, 0))
    self.joint2_tf = Transformation()
    self.link2_tf = Transformation(translation=(self.link2_len, 0))
    self.link1_tf_global = self.robot_tf * self.joint1_tf * self.link1_tf
    self.link2_tf_global = self.link1_tf_global * self.joint2_tf * self.link2_tf

    self.step_count = 0.00

    # Generate the target point
    self.target_tf = Transformation(translation=(
        random.randrange(-self.env_boundary, self.env_boundary),
        random.randrange(-self.env_boundary, self.env_boundary)))

    self.ou = OUNoise(dt=self.dt, theta=0.1, sigma=0.2)

    self.done = False
    self.t = 0

    # Buffer for visualization; cleared whenever the episode is reset.
    self.buffer = []

    # Unlike step(), reset() returns only the initial state.
    return self._get_state()
def main():
    experiment = 'model-builder-v0'  # specify environments here
    env = gym.make(experiment)
    # steps = env.spec.timestep_limit  # steps per episode
    steps = 20
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if (done or (t == steps - 1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
def learn(self, total_timesteps, callback):
    ou_scale = 1.0     # initial scaling factor; this slowly decreases to 0
    ou_decay = 0.9995  # decay of the scaling factor ou_scale
    ou_mu = 0.0        # asymptotic mean of the noise
    ou_theta = 0.15    # magnitude of the drift term
    ou_sigma = 0.20    # magnitude of the diffusion term

    # create the noise process
    noise_process = OUNoise(self.action_size, ou_mu, ou_theta, ou_sigma)

    # create the replay buffer
    buffer = ReplayBuffer(seed=self.seed,
                          action_size=self.action_size,
                          buffer_size=self.buffer_size,
                          batch_size=self.batch_size,
                          device=self.device)

    self.t_step = 0
    episode = 0
    while self.t_step < total_timesteps:
        callback.on_start_episode(episode)
        episode_scores = np.zeros(self.env.num_agents)
        states, _, _ = self.env.reset()
        scores = np.zeros(2)
        while True:
            # reshape so we can feed both agents' states to each agent
            states = np.reshape(states, (1, 48))
            # split the states into the parts observed by each agent
            states_0 = states[0, :24].reshape((1, 24))
            states_1 = states[0, 24:].reshape((1, 24))

            # generate noise
            noise = ou_scale * noise_process.get_noise().reshape((1, 4))
            # split the noise into the parts for each agent
            noise_0 = noise[0, :2].reshape((1, 2))
            noise_1 = noise[0, 2:].reshape((1, 2))

            # determine actions for the unity agents from the current state, using noise for exploration
            actions_0 = self.player_policy.act(states_0, use_target=False, add_noise=True,
                                               noise_value=noise_0).detach().cpu().numpy()
            actions_1 = self.opponent_policy.act(states_1, use_target=False, add_noise=True,
                                                 noise_value=noise_1).detach().cpu().numpy()
            actions = np.vstack((actions_0.flatten(), actions_1.flatten()))

            # take the action in the environment
            next_states, rewards, dones, info = self.env.step(actions)

            # store (S, A, R, S') info in the replay buffer (memory)
            buffer.add(states.flatten(), actions.flatten(), rewards, next_states.flatten(), dones)

            episode_scores += rewards
            states = next_states
            self.t_step += 1

            """
            Policy learning
            """
            # train the agents if we have enough replays in the buffer
            if len(buffer) >= self.batch_size:
                self.player_policy.learn(buffer.sample(), self.opponent_policy)
                self.opponent_policy.learn(buffer.sample(), self.player_policy)

            if np.any(dones):
                break

        if not callback.on_step(np.max(episode_scores), self.t_step):
            break

        # decrease the scaling factor of the noise
        ou_scale *= ou_decay
        episode += 1
def __init__(self, model, env, sess, num_episodes, direction):
    self.model = model
    self.sess = sess
    self.direction = direction
    self.env = env
    self.num_episodes = num_episodes
    self.episode_start = 0
    self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
    self.noise_decay = 0.2
    self.epsilon = EPSILON
    self.epsilon_decay = nth_root(self.num_episodes, 0.001 / self.epsilon)
    self.count_exp_replay = 0
    self.tau = TAU

    self.target_Q_ph = tf.placeholder(tf.float32, shape=(None, 1))
    self.actions_grads_ph = tf.placeholder(
        tf.float32, shape=((None, ) + self.env.action_space.shape))

    # train operation
    self.actor_train_ops = self.model.Actor.train_step(self.actions_grads_ph)
    self.critic_train_ops = self.model.Critic.train_step(self.target_Q_ph)

    # update operation
    self.update_critic_target = self.model.update_target_network(
        self.model.Critic.network_params,
        self.model.Critic_target.network_params, self.tau)
    self.update_actor_target = self.model.update_target_network(
        self.model.Actor.network_params,
        self.model.Actor_target.network_params, self.tau)

    sess.run(tf.initialize_all_variables())

    # for testing only
    self.sess.run(
        self.model.update_target_network(
            self.model.Critic.network_params,
            self.model.Critic_target.network_params))
    self.sess.run(
        self.model.update_target_network(
            self.model.Actor.network_params,
            self.model.Actor_target.network_params))

    # reward summary for tensorboard
    self.tf_reward = tf.Variable(0.0, trainable=False, name='reward_summary')
    self.tf_reward_summary = tf.summary.scalar("Reward by episode", self.tf_reward)

    # time
    self.tf_time = tf.Variable(0.0, trainable=False, name='Time_per_episode')
    self.tf_time_summary = tf.summary.scalar("Time per episode", self.tf_time)

    # writer
    self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)
def main():
    experiment = 'InvertedPendulum-v1'  # specify environments here
    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if (done or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
def __init__(self, env, state_size, action_size):
    self.env = env
    self.replay_memory = deque()
    self.actor_network = actor_network.ActorNetwork(state_size, action_size)
    self.critic_network = critic_network.CriticNetwork(state_size, action_size)
    self.ou_noise = OUNoise(action_size)
    self.time_step = 0
def __init__(self, env, args):
    self.direction = args.direction
    self.env = env
    self.num_episodes = args.episodes
    self.episode_start = 0
    self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
    self.noise_decay = args.noise_decay
    self.count_exp_replay = 0
    self.train_iteration = 0
    self.tau = args.TAU
    self.tools = Tools()
def main():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE, CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            # observation, reward, done, info = env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if (done or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
def __init__(self, task, train=True):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Set the learning rate suggested by paper:
    # https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
    self.actor_learning_rate = 0.001
    self.actor_decay = 0.0
    self.critic_learning_rate = 0.001
    self.critic_decay = 0.0

    # Actor Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                             self.action_high, self.actor_learning_rate, self.actor_decay)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                              self.action_high, self.actor_learning_rate, self.actor_decay)

    # Critic Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               self.critic_learning_rate, self.critic_decay)
    self.critic_target = Critic(self.state_size, self.action_size,
                                self.critic_learning_rate, self.critic_decay)

    # initialize targets model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    # self.exploration_theta = 0.15
    # self.exploration_sigma = 0.2
    self.exploration_theta = 0.01
    self.exploration_sigma = 0.02
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    self.best_w = None
    self.best_score = -np.inf
    # self.noise_scale = 0.7
    self.score = 0

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters

    # Indicate if we want to learn (or use to predict without learn)
    self.set_train(train)
def main():
    env = Env(19997)
    steps = 10000
    num_states = 59
    num_actions = 3

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')

    for i in range(episodes):
        # print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        done = False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise

            # clip each action component to [-1, 1]
            # (use a separate index so the episode counter i is not shadowed)
            for k in range(num_actions):
                if action[k] > 1.0:
                    action[k] = 1.0
                if action[k] < -1.0:
                    action[k] = -1.0

            observation, reward, done = env.step(action)
            print("reward:", reward, "\n")

            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if (done or (t == steps - 1)):
                print('Episode', i, 'Steps: ', t, 'Episode Reward:', reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
        total_reward += reward_per_episode
def __init__(self, sess, data_fname):
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = Hp.state_dim
    self.action_dim = Hp.action_dim
    print(self.state_dim, self.action_dim)

    self.sess = sess

    self.state_input = [
        tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
        for _ in xrange(Hp.categories)
    ]  # tf.placeholder("float",[None,self.state_dim])
    self.target_state_input = [
        tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
        for _ in xrange(Hp.categories)
    ]  # tf.placeholder("float",[None,self.state_dim])

    self.state_network = StateEnc(self.sess, self.state_input, self.target_state_input)
    state_batch = self.state_network.encoding
    next_state_batch = self.state_network.target_encoding

    weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = \
        self.state_network.get_parameters()
    state_network_params = weights + biases + [
        w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
    ]

    self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim,
                                      self.state_input, state_batch,
                                      next_state_batch, state_network_params)
    self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim,
                                        state_batch, next_state_batch)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
    self.summary_str2 = None

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self):
    super(FirstAgent, self).__init__()

    # actor models
    self.actor_local = None
    self.actor_target = None

    # critic models
    self.critic_local = None
    self.critic_target = None

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
def __init__(self, state_space, action_dim):
    self.name = 'DDPG'  # name for uploading results
    self.sess = tf.Session()

    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_space = state_space
    self.action_dim = action_dim  # 1

    self.ac_network = ActorCriticNetwork(self.sess, self.state_space, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env, device):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    self.device = device

    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.actor_network = ActorNetwork(self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env):
    self.sess = tf.InteractiveSession()
    # self.params = loadparams()  # ???
    self.env = env
    self.n_states = env.observation_space.shape[0]
    self.n_actions = env.action_space.shape[0]
    self.low = self.env.action_space.low
    self.high = self.env.action_space.high

    self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
    self.trainable_var_count = self.actor_network.get_trainable_var_count()
    self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions,
                                        self.actor_network, self.trainable_var_count)

    self.replay_buffer = ReplayBuffer(BUFFER_SIZE)  # params['buffer_size'] ???
    self.exploration_noise = OUNoise(self.n_actions)
    # self.noise = Noise()
    self.gamma = GAMMA

    self.sess.run(tf.global_variables_initializer())
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.08
    self.exploration_sigma = 0.15
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.95  # discount factor (0.99)
    self.tau = 0.001   # for soft update of target parameters (0.01)

    # Score tracker and learning parameters
    self.total_reward = None
    self.count = 0
    self.score = 0
    self.best_score = -np.inf
    self.last_state = None
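# Sketch of how the Keras-based agents above typically apply the OU noise when
# selecting an action. This act() method is not shown in the previous snippet;
# it is an assumed pattern (actor_local wrapping a Keras model with .predict,
# OUNoise exposing .sample()):
def act(self, state):
    """Return an action for the given state, perturbed by exploration noise."""
    state = np.reshape(state, [-1, self.state_size])
    action = self.actor_local.model.predict(state)[0]
    return list(action + self.noise.sample())  # add OU noise for exploration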
def create_networks_and_training_method(self, state_dim, action_dim):
    theta_p = networks.theta_p(state_dim, action_dim)
    theta_q = networks.theta_q(state_dim, action_dim)
    target_theta_p, target_update_p = self.exponential_moving_averages(theta_p, TAU)
    target_theta_q, target_update_q = self.exponential_moving_averages(theta_q, TAU)

    self.state = tf.placeholder(tf.float32, [None, state_dim], 'state')
    self.action_test = networks.policy_network(self.state, theta_p)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration = OUNoise(action_dim)
    noise = self.exploration.noise()
    self.action_exploration = self.action_test + noise

    q = networks.q_network(self.state, self.action_test, theta_q)

    # policy optimization
    mean_q = tf.reduce_mean(q)
    weight_decay_p = tf.add_n([L2_POLICY * tf.nn.l2_loss(var) for var in theta_p])
    loss_p = -mean_q + weight_decay_p
    optim_p = tf.train.AdamOptimizer(P_LEARNING_RATE)
    grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=theta_p)
    optimize_p = optim_p.apply_gradients(grads_and_vars_p)
    with tf.control_dependencies([optimize_p]):
        self.train_p = tf.group(target_update_p)

    # q optimization
    self.action_train = tf.placeholder(tf.float32, [None, action_dim], 'action_train')
    self.reward = tf.placeholder(tf.float32, [None], 'reward')
    self.next_state = tf.placeholder(tf.float32, [None, state_dim], 'next_state')
    self.done = tf.placeholder(tf.bool, [None], 'done')

    q_train = networks.q_network(self.state, self.action_train, theta_q)
    next_action = networks.policy_network(self.next_state, theta=target_theta_p)
    next_q = networks.q_network(self.next_state, next_action, theta=target_theta_q)
    q_target = tf.stop_gradient(tf.select(self.done, self.reward, self.reward + GAMMA * next_q))

    # q loss
    q_error = tf.reduce_mean(tf.square(q_target - q_train))
    weight_decay_q = tf.add_n([L2_Q * tf.nn.l2_loss(var) for var in theta_q])
    loss_q = q_error + weight_decay_q
    optim_q = tf.train.AdamOptimizer(Q_LEARNING_RATE)
    grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=theta_q)
    optimize_q = optim_q.apply_gradients(grads_and_vars_q)
    with tf.control_dependencies([optimize_q]):
        self.train_q = tf.group(target_update_q)

    tf.scalar_summary("loss_q", loss_q)
    tf.scalar_summary("loss_p", loss_p)
    tf.scalar_summary("q_mean", mean_q)
    global merged_summary_op
    merged_summary_op = tf.merge_all_summaries()
def __init__(self, environment):
    self.name = 'DDPG'  # name for uploading results
    self.environment = environment

    # Randomly initialize actor network and critic network
    # with both their target networks
    self.actor_network = ActorNetwork(state_size=environment.observation_space.shape[0],
                                      action_size=environment.action_space.shape[0])
    self.critic_network = CriticNetwork(state_size=environment.observation_space.shape[0],
                                        action_size=environment.action_space.shape[0])

    # initialize replay buffer
    self.replay_buffer = deque()

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(environment.action_space.shape[0])

    # Initialize time step
    self.time_step = 0
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
class DDPG:
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input,
                                                    self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim = 1, it's a number, not an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        #     return

        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
def __init__(self):
    # Initialize our session
    self.session = tf.Session()
    self.graph = self.session.graph

    with self.graph.as_default():
        # View the state batches
        self.visualize_input = VISUALIZE_BUFFER
        if self.visualize_input:
            self.viewer = CostmapVisualizer()

        # Hardcode input size and action size
        self.height = 86
        self.width = self.height
        self.depth = 4
        self.action_dim = 2

        # Initialize the current action and the old action and old state for setting experiences
        self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
        self.old_action = np.ones(2, dtype='float')
        self.network_action = np.zeros(2, dtype='float')
        self.noise_action = np.zeros(2, dtype='float')
        self.action = np.zeros(2, dtype='float')

        # Initialize the grad inverter object to keep the action bounds
        self.action_bounds = [[0.3, 0.3], [-0.3, -0.3]]
        self.grad_inv = GradInverter(self.action_bounds)

        # Initialize summary writers to plot variables during training
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(os.path.expanduser('~') + '/tensorboard_data')

        # Initialize actor and critic networks
        self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                          self.summary_writer)
        self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                            self.summary_writer)

        # Initialize the saver to save the network params
        self.saver = tf.train.Saver()

        # initialize the experience data manager
        self.data_manager = DataManager(self.session.graph, self.session, BATCH_SIZE)

        # Should we load the pre-trained params?
        # If so: Load the full pre-trained net
        # Else: Initialize all variables, then overwrite the conv layers with the pre-trained filters
        if PRE_TRAINED_NETS:
            self.saver.restore(self.session, NET_LOAD_PATH)
        else:
            self.session.run(tf.initialize_all_variables())
            self.critic_network.restore_pretrained_weights(FILTER_LOAD_PATH)
            self.actor_network.restore_pretrained_weights(FILTER_LOAD_PATH)

        threads = tf.train.start_queue_runners(sess=self.session)
        time.sleep(1)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
        self.noise_flag = True

        # Initialize time step
        self.training_step = 0

        # Flag: don't learn the first experience
        self.first_experience = True

        # After the graph has been filled add it to the summary writer
        self.summary_writer.add_graph(self.graph)
class DDPG(object):
    def __init__(self, env, args):
        self.action_dim = env.action_space.shape[0]
        self.state_dim = env.observation_space.shape[0]

        self.actor_lr = args.a_lr
        self.critic_lr = args.c_lr
        self.gamma = args.gamma

        # Ornstein-Uhlenbeck noise parameters
        self.ou = OUNoise(
            self.action_dim, theta=args.noise_theta, sigma=args.noise_sigma)

        self.replay_buffer = deque(maxlen=args.buffer_size)
        self.replay_start_size = args.replay_start_size
        self.batch_size = args.batch_size
        self.target_update_rate = args.target_update_rate
        self.total_parameters = 0
        self.global_steps = 0
        self.reg_param = args.reg_param

    def construct_model(self, gpu):
        if gpu == -1:  # use CPU
            device = '/cpu:0'
            sess_config = tf.ConfigProto()
        else:  # use GPU
            device = '/gpu:' + str(gpu)
            sess_config = tf.ConfigProto(
                log_device_placement=True, allow_soft_placement=True)
            sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=sess_config)

        with tf.device(device):
            # output action, q_value and gradients of q_val w.r.t. action
            with tf.name_scope('predict_actions'):
                self.states = tf.placeholder(
                    tf.float32, [None, self.state_dim], name='states')
                self.action = tf.placeholder(
                    tf.float32, [None, self.action_dim], name='action')
                self.is_training = tf.placeholder(tf.bool, name='is_training')

                self.action_outputs, self.actor_params = self._build_actor(
                    self.states, scope='actor_net', bn=True)
                self.value_outputs, self.critic_params = self._build_critic(
                    self.states, self.action, scope='critic_net', bn=False)
                self.action_gradients = tf.gradients(
                    self.value_outputs, self.action)[0]

            # estimate target_q for update critic
            with tf.name_scope('estimate_target_q'):
                self.next_states = tf.placeholder(
                    tf.float32, [None, self.state_dim], name='next_states')
                self.mask = tf.placeholder(tf.float32, [None], name='mask')
                self.rewards = tf.placeholder(tf.float32, [None], name='rewards')

                # target actor network
                self.t_action_outputs, self.t_actor_params = self._build_actor(
                    self.next_states, scope='t_actor_net', bn=True, trainable=False)
                # target critic network
                self.t_value_outputs, self.t_critic_params = self._build_critic(
                    self.next_states, self.t_action_outputs, bn=False,
                    scope='t_critic_net', trainable=False)

                self.target_q = self.rewards + self.gamma * \
                    (self.t_value_outputs[:, 0] * self.mask)

            with tf.name_scope('compute_gradients'):
                self.actor_opt = tf.train.AdamOptimizer(self.actor_lr)
                self.critic_opt = tf.train.AdamOptimizer(self.critic_lr)

                # critic gradients
                td_error = self.target_q - self.value_outputs[:, 0]
                critic_mse = tf.reduce_mean(tf.square(td_error))
                critic_reg = tf.reduce_sum(
                    [tf.nn.l2_loss(v) for v in self.critic_params])
                critic_loss = critic_mse + self.reg_param * critic_reg
                self.critic_gradients = \
                    self.critic_opt.compute_gradients(
                        critic_loss, self.critic_params)
                # actor gradients
                self.q_action_grads = tf.placeholder(
                    tf.float32, [None, self.action_dim], name='q_action_grads')
                actor_gradients = tf.gradients(
                    self.action_outputs, self.actor_params, -self.q_action_grads)
                self.actor_gradients = zip(actor_gradients, self.actor_params)
                # apply gradient to update model
                self.train_actor = self.actor_opt.apply_gradients(
                    self.actor_gradients)
                self.train_critic = self.critic_opt.apply_gradients(
                    self.critic_gradients)

            with tf.name_scope('update_target_networks'):
                # batch norm parameters should not be included when updating!
                target_networks_update = []

                for v_source, v_target in zip(
                        self.actor_params, self.t_actor_params):
                    update_op = v_target.assign_sub(
                        0.001 * (v_target - v_source))
                    target_networks_update.append(update_op)

                for v_source, v_target in zip(
                        self.critic_params, self.t_critic_params):
                    update_op = v_target.assign_sub(
                        0.01 * (v_target - v_source))
                    target_networks_update.append(update_op)

                self.target_networks_update = tf.group(*target_networks_update)

            with tf.name_scope('total_numbers_of_parameters'):
                for v in tf.trainable_variables():
                    shape = v.get_shape()
                    param_num = 1
                    for d in shape:
                        param_num *= d.value
                    print(v.name, ' ', shape, ' param nums: ', param_num)
                    self.total_parameters += param_num
                print('Total nums of parameters: ', self.total_parameters)

    def sample_action(self, states, noise):
        # is_training is supposed to be False when sampling actions.
        action = self.sess.run(
            self.action_outputs,
            feed_dict={self.states: states, self.is_training: False})
        ou_noise = self.ou.noise() if noise else 0

        return action + ou_noise

    def store_experience(self, s, a, r, next_s, done):
        self.replay_buffer.append([s, a[0], r, next_s, done])
        self.global_steps += 1

    def update_model(self):
        if len(self.replay_buffer) < self.replay_start_size:
            return

        # get batch
        batch = random.sample(self.replay_buffer, self.batch_size)
        s, _a, r, next_s, done = np.vstack(batch).T.tolist()
        mask = ~np.array(done)

        # compute a = u(s)
        a = self.sess.run(self.action_outputs, {
            self.states: s,
            self.is_training: True
        })
        # gradients of q_value w.r.t action a
        dq_da = self.sess.run(self.action_gradients, {
            self.states: s,
            self.action: a,
            self.is_training: True
        })
        # train
        self.sess.run([self.train_actor, self.train_critic], {
            # train_actor feed
            self.states: s,
            self.is_training: True,
            self.q_action_grads: dq_da,
            # train_critic feed
            self.next_states: next_s,
            self.action: _a,
            self.mask: mask,
            self.rewards: r
        })
        # update target network
        self.sess.run(self.target_networks_update)

    def _build_actor(self, states, scope, bn=False, trainable=True):
        h1_dim = 400
        h2_dim = 300
        init = tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_IN', uniform=True)

        with tf.variable_scope(scope):
            if bn:
                states = self.batch_norm(
                    states, self.is_training, tf.identity,
                    scope='actor_bn_states', trainable=trainable)
            h1 = tcl.fully_connected(
                states, h1_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='actor_h1')
            if bn:
                h1 = self.batch_norm(
                    h1, self.is_training, tf.nn.relu, scope='actor_bn_h1',
                    trainable=trainable)
            else:
                h1 = tf.nn.relu(h1)
            h2 = tcl.fully_connected(
                h1, h2_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='actor_h2')
            if bn:
                h2 = self.batch_norm(
                    h2, self.is_training, tf.nn.relu, scope='actor_bn_h2',
                    trainable=trainable)
            else:
                h2 = tf.nn.relu(h2)
            # use tanh to bound the action
            a = tcl.fully_connected(
                h2, self.action_dim, activation_fn=tf.nn.tanh,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-4, 3e-4),
                trainable=trainable, scope='actor_out')
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
        return a, params

    def _build_critic(self, states, action, scope, bn=False, trainable=True):
        h1_dim = 400
        h2_dim = 300
        init = tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_IN', uniform=True)

        with tf.variable_scope(scope):
            if bn:
                states = self.batch_norm(
                    states, self.is_training, tf.identity,
                    scope='critic_bn_state', trainable=trainable)
            h1 = tcl.fully_connected(
                states, h1_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='critic_h1')
            if bn:
                h1 = self.batch_norm(
                    h1, self.is_training, tf.nn.relu, scope='critic_bn_h1',
                    trainable=trainable)
            else:
                h1 = tf.nn.relu(h1)
            # skip action from the first layer
            h1 = tf.concat([h1, action], 1)
            h2 = tcl.fully_connected(
                h1, h2_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='critic_h2')
            if bn:
                h2 = self.batch_norm(
                    h2, self.is_training, tf.nn.relu, scope='critic_bn_h2',
                    trainable=trainable)
            else:
                h2 = tf.nn.relu(h2)
            q = tcl.fully_connected(
                h2, 1, activation_fn=None,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-4, 3e-4),
                trainable=trainable, scope='critic_out')
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
        return q, params

    def batch_norm(self, x, is_training, activation_fn, scope, trainable=True):
        # switch the 'is_training' flag and 'reuse' flag
        return tf.cond(
            is_training,
            lambda: tf.contrib.layers.batch_norm(
                x, activation_fn=activation_fn, center=True, scale=True,
                updates_collections=None, is_training=True, reuse=None,
                scope=scope, decay=0.9, epsilon=1e-5, trainable=trainable),
            lambda: tf.contrib.layers.batch_norm(
                x, activation_fn=activation_fn, center=True, scale=True,
                updates_collections=None, is_training=False,
                reuse=True,  # to be able to reuse, scope must be given
                scope=scope, decay=0.9, epsilon=1e-5, trainable=trainable))
class DDPG: def __init__(self): # Make sure all the directories exist if not tf.gfile.Exists(TFLOG_PATH): tf.gfile.MakeDirs(TFLOG_PATH) if not tf.gfile.Exists(EXPERIENCE_PATH): tf.gfile.MakeDirs(EXPERIENCE_PATH) if not tf.gfile.Exists(NET_SAVE_PATH): tf.gfile.MakeDirs(NET_SAVE_PATH) # Initialize our session self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches self.visualize_input = VISUALIZE_BUFFER if self.visualize_input: self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 86 self.width = self.height self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8') self.old_action = np.ones(2, dtype='float') self.network_action = np.zeros(2, dtype='float') self.noise_action = np.zeros(2, dtype='float') self.action = np.zeros(2, dtype='float') # Initialize the grad inverter object to keep the action bounds self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session) # Make sure the directory for the data files exists if not tf.gfile.Exists(DATA_PATH): tf.gfile.MakeDirs(DATA_PATH) # Initialize summary writers to plot variables during training self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH) # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session) # Uncomment if collecting a buffer for the autoencoder # self.buffer = deque() # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph) def train(self): # Check if the buffer is big enough to start training if self.data_manager.enough_data(): # get the next random batch from the data manger state_batch, \ action_batch, \ reward_batch, \ next_state_batch, \ is_episode_finished_batch = self.data_manager.get_next_batch() state_batch = np.divide(state_batch, 100.0) next_state_batch = np.divide(next_state_batch, 100.0) # Are we visualizing the first state batch for debugging? 
# If so: We have to scale up the values for grey scale before plotting if self.visualize_input: state_batch_np = np.asarray(state_batch) state_batch_np = np.multiply(state_batch_np, -100.0) state_batch_np = np.add(state_batch_np, 100.0) self.viewer.set_data(state_batch_np) self.viewer.run() self.visualize_input = False # Calculate y for the td_error of the critic y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch) for i in range(0, BATCH_SIZE): if is_episode_finished_batch[i]: y_batch.append([reward_batch[i]]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Now that we have the y batch lets train the critic self.critic_network.train(y_batch, state_batch, action_batch) # Get the action batch so we can calculate the action gradient with it # Then get the action gradient batch and adapt the gradient with the gradient inverting method action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients) q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients) # Now we can train the actor self.actor_network.train(q_gradient_batch, state_batch) # Save model if necessary if self.training_step > 0 and self.training_step % SAVE_STEP == 0: self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step) # Update time step self.training_step += 1 self.data_manager.check_for_enqueue() def get_action(self, state): # normalize the state state = state.astype(float) state = np.divide(state, 100.0) # Get the action self.action = self.actor_network.get_action(state) # Are we using noise? if self.noise_flag: # scale noise down to 0 at training step 3000000 if self.training_step < MAX_NOISE_STEP: self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise() # if action value lies outside of action bounds, rescale the action vector if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]: self.action *= np.fabs(A0_BOUNDS[0]/self.action[0]) if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]: self.action *= np.fabs(A1_BOUNDS[0]/self.action[1]) # Life q value output for this action and state self.print_q_value(state, self.action) return self.action def set_experience(self, state, reward, is_episode_finished): # Make sure we're saving a new old_state for the first experience of every episode if self.first_experience: self.first_experience = False else: self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state, is_episode_finished) # Uncomment if collecting data for the auto_encoder # experience = (self.old_state, self.old_action, reward, state, is_episode_finished) # self.buffer.append(experience) if is_episode_finished: self.first_experience = True self.exploration_noise.reset() # Safe old state and old action for next experience self.old_state = state self.old_action = self.action def print_q_value(self, state, action): string = "-" q_value = self.critic_network.evaluate([state], [action]) stroke_pos = 30 * q_value[0][0] + 30 if stroke_pos < 0: stroke_pos = 0 elif stroke_pos > 60: stroke_pos = 60 print '[' + stroke_pos * string + '|' + (60-stroke_pos) * string + ']', "Q: ", q_value[0][0], \ "\tt: ", self.training_step
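# --- Illustrative sketch (not part of the original snippets) ---
# get_action() above scales the Ornstein-Uhlenbeck sample linearly from full strength at
# step 0 down to zero at MAX_NOISE_STEP, then rescales the whole action vector whenever a
# component leaves its bounds. A simpler variant that clips each action dimension against
# its own bounds; the constants below are hypothetical stand-ins for MAX_NOISE_STEP,
# A0_BOUNDS and A1_BOUNDS:
import numpy as np

MAX_NOISE_STEP = 3000000
ACTION_BOUNDS = np.array([[-1.0, 1.0],    # bounds for action[0]
                          [-0.5, 0.5]])   # bounds for action[1]

def noisy_bounded_action(action, ou_sample, training_step):
    """Add linearly decaying exploration noise, then keep each dimension inside its bounds."""
    if training_step < MAX_NOISE_STEP:
        scale = float(MAX_NOISE_STEP - training_step) / MAX_NOISE_STEP
        action = action + scale * ou_sample
    return np.clip(action, ACTION_BOUNDS[:, 0], ACTION_BOUNDS[:, 1])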
def main(): experiment = 'InvertedPendulum-v1' env = gym.make(experiment) assert isinstance(env.observation_space, Box), "observation space must be continuous" assert isinstance(env.action_space, Box), "action space must be continuous" # Randomly initialize critic, actor, target critic, target actor network and replay buffer agent = DDPG(env) exploration_noise = OUNoise(env.action_space.shape[0]) counter = 0 total_reward = 0 num_states = env.observation_space.shape[0] num_actions = env.action_space.shape[0] # saving reward: reward_st = np.array([0]) for i in xrange(episodes): observation = env.reset() reward_per_episode = 0 for t in xrange(steps): # rendering environment (optional) #env.render() x = observation # select action using actor network model action = agent.evaluate_actor(np.reshape(x, [1, num_states])) noise = exploration_noise.noise() action = action[0] + noise print 'Agent.Action :', action print '\n' print '\n' observation, reward, done, _ = env.step(action) # add s_t, s_t+1, action, reward to experience memory agent.add_experience(x, observation, action, reward, done) # train critic and actor network if counter > 64: agent.train() reward_per_episode += reward counter += 1 # check if episode ends: if done: print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode exploration_noise.reset() reward_st = np.append(reward_st, reward_per_episode) np.savetxt('episode_reward.txt', reward_st, newline="\n") print '\n' print '\n' break total_reward += reward_per_episode print "Average reward per episode {}".format(total_reward / episodes)
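# --- Illustrative sketch (not part of the original snippets) ---
# OUNoise is used by every snippet in this collection but never defined here. A minimal
# Ornstein-Uhlenbeck process of the kind typically used for DDPG exploration; the default
# mu/theta/sigma values are assumptions, not taken from any of the snippets above:
import numpy as np

class OUNoise(object):
    """Temporally correlated exploration noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at mu, typically at the beginning of each episode.
        self.state = np.copy(self.mu)

    def noise(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state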
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
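# --- Illustrative sketch (not part of the original snippets) ---
# The y_batch loop above implements the DDPG target
#     y_i = r_i                                      if the episode terminated at step i
#     y_i = r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1}))  otherwise.
# A vectorised NumPy equivalent (the GAMMA value is only an example):
import numpy as np

def td_targets(rewards, next_q_values, done_flags, gamma=0.99):
    """Return the (batch_size, 1) array of critic regression targets."""
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
    next_q = np.asarray(next_q_values, dtype=np.float32).reshape(-1)
    not_done = 1.0 - np.asarray(done_flags, dtype=np.float32)
    return (rewards + gamma * next_q * not_done).reshape(-1, 1)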
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self,observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch,[BATCH_SIZE,1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch) for i in range(0,BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high) def set_feedback(self,observation,action,reward,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append((self.state,action,reward,next_state,done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
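# --- Illustrative sketch (not part of the original snippets) ---
# This snippet keeps its replay buffer as a plain deque and calls popleft() once it grows
# past REPLAY_BUFFER_SIZE. The same behaviour in a small self-contained class; note that
# deque(maxlen=max_size) would evict the oldest transition automatically:
import random
from collections import deque

class SimpleReplayBuffer(object):
    """Deque-backed FIFO replay buffer with uniform random sampling."""

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = deque()

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
        if len(self.buffer) > self.max_size:
            self.buffer.popleft()  # drop the oldest transition

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)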
class DDPG: def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] # Initialize time step self.time_step = 0 # initialize replay buffer self.replay_buffer = deque() # initialize networks self.create_networks_and_training_method(state_dim,action_dim) self.sess = tf.InteractiveSession() self.sess.run(tf.initialize_all_variables()) # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state("saved_networks") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path else: print "Could not find old network weights" global summary_writer summary_writer = tf.train.SummaryWriter('~/logs',graph=self.sess.graph) def create_networks_and_training_method(self,state_dim,action_dim): theta_p = networks.theta_p(state_dim,action_dim) theta_q = networks.theta_q(state_dim,action_dim) target_theta_p,target_update_p = self.exponential_moving_averages(theta_p,TAU) target_theta_q,target_update_q = self.exponential_moving_averages(theta_q,TAU) self.state = tf.placeholder(tf.float32,[None,state_dim],'state') self.action_test = networks.policy_network(self.state,theta_p) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration = OUNoise(action_dim) noise = self.exploration.noise() self.action_exploration = self.action_test + noise q = networks.q_network(self.state,self.action_test,theta_q) # policy optimization mean_q = tf.reduce_mean(q) weight_decay_p = tf.add_n([L2_POLICY * tf.nn.l2_loss(var) for var in theta_p]) loss_p = -mean_q + weight_decay_p optim_p = tf.train.AdamOptimizer(P_LEARNING_RATE) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): self.train_p = tf.group(target_update_p) # q optimization self.action_train = tf.placeholder(tf.float32,[None,action_dim],'action_train') self.reward = tf.placeholder(tf.float32,[None],'reward') self.next_state = tf.placeholder(tf.float32,[None,state_dim],'next_state') self.done = tf.placeholder(tf.bool,[None],'done') q_train = networks.q_network(self.state,self.action_train,theta_q) next_action = networks.policy_network(self.next_state,theta=target_theta_p) next_q = networks.q_network(self.next_state,next_action,theta=target_theta_q) q_target = tf.stop_gradient(tf.select(self.done,self.reward,self.reward + GAMMA * next_q)) # q loss q_error = tf.reduce_mean(tf.square(q_target - q_train)) weight_decay_q = tf.add_n([L2_Q * tf.nn.l2_loss(var) for var in theta_q]) loss_q = q_error + weight_decay_q optim_q = tf.train.AdamOptimizer(Q_LEARNING_RATE) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): self.train_q = tf.group(target_update_q) tf.scalar_summary("loss_q",loss_q) tf.scalar_summary("loss_p",loss_p) tf.scalar_summary("q_mean",mean_q) global merged_summary_op merged_summary_op = tf.merge_all_summaries() def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] 
next_state_batch = [data[3] for data in minibatch] done_batch = [data[4] for data in minibatch] _,_,summary_str = self.sess.run([self.train_p,self.train_q,merged_summary_op],feed_dict={ self.state:state_batch, self.action_train:action_batch, self.reward:reward_batch, self.next_state:next_state_batch, self.done:done_batch }) summary_writer.add_summary(summary_str,self.time_step) # save network every 1000 iteration if self.time_step % 1000 == 0: self.saver.save(self.sess, 'saved_networks/' + 'network' + '-ddpg', global_step = self.time_step) def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.sess.run(self.action_exploration,feed_dict={ self.state:[state] })[0] return np.clip(action,self.environment.action_space.low,self.environment.action_space.high) def action(self,state): action = self.sess.run(self.action_test,feed_dict={ self.state:[state] })[0] return np.clip(action,self.environment.action_space.low,self.environment.action_space.high) def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.append((state,action,reward,next_state,done)) # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() # Re-iniitialize the random process when an episode ends if done: self.exploration.reset() # f fan-in size def exponential_moving_averages(self,theta, tau=0.001): ema = tf.train.ExponentialMovingAverage(decay=1 - tau) update = ema.apply(theta) # also creates shadow vars averages = [ema.average(x) for x in theta] return averages, update
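# --- Illustrative sketch (not part of the original snippets) ---
# exponential_moving_averages() above uses tf.train.ExponentialMovingAverage with
# decay = 1 - tau, so the shadow (target) variables follow
#     theta_target <- decay * theta_target + (1 - decay) * theta
#                   = (1 - tau) * theta_target + tau * theta,
# which is exactly the soft update written out explicitly in the earlier snippets.
# A quick NumPy check with made-up values:
import numpy as np

tau = 0.001
theta = np.array([1.0, 2.0, 3.0])          # online parameters
theta_target = np.array([0.0, 0.5, 1.0])   # target (shadow) parameters

ema_step = (1.0 - tau) * theta_target + tau * theta            # ExponentialMovingAverage form
explicit_step = theta_target - tau * (theta_target - theta)    # assign_sub form

assert np.allclose(ema_step, explicit_step)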