def __init__(self):
    super(FirstAgent, self).__init__()
    # actor models
    self.actor_local = None
    self.actor_target = None
    # critic models
    self.critic_local = None
    self.critic_target = None
    # Noise process (action_size is assumed to be set by the base class)
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)
    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
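# The snippets in this collection construct OUNoise with varying signatures and
# method names (noise(), sample(), get_noise()). As a point of reference, here
# is a minimal sketch of an Ornstein-Uhlenbeck noise process matching the
# four-argument OUNoise(size, mu, theta, sigma) usage above; the actual class
# in each repository may differ.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; assumed signature)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then x <- x + dx
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state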
def __init__(self, state_size, action_size, agent_id):
    """Initialize a DDPGAgent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        agent_id (int): identifier for this agent
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(RANDOM_SEED)
    self.agent_id = agent_id

    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    self.critic_local = Critic(state_size, action_size).to(device)
    self.critic_target = Critic(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Make sure that the target-local model pairs are initialized to the
    # same weights
    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    self.noise = OUNoise(action_size)
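# The hard_update helper called above is not shown in the snippet. A common
# PyTorch implementation, given here as an assumption about its behavior
# (copy the local network's weights into the target network):
def hard_update(self, local_model, target_model):
    # Copy every parameter of the local network into the target network.
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(local_param.data)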
def __init__(self, state_size, action_size, seed):
    self.gradient_clipping = True
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.config = Config()

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.config.LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.config.LR_CRITIC,
                                       weight_decay=self.config.WEIGHT_DECAY)

    self.noise = OUNoise(action_size, seed)
    self.memory = ReplayBuffer(action_size, self.config.BUFFER_SIZE,
                               self.config.BATCH_SIZE, seed, device)
    self.step_count = 0
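# The gradient_clipping flag above suggests the critic update clips gradients
# before the optimizer step. A hedged sketch of that step (the helper name and
# the clip norm of 1.0 are assumptions, not taken from the source):
import torch

def clipped_critic_step(agent, critic_loss, max_norm=1.0):
    agent.critic_optimizer.zero_grad()
    critic_loss.backward()
    if agent.gradient_clipping:
        # Clip the total gradient norm in place to stabilize training.
        torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), max_norm)
    agent.critic_optimizer.step()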
def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau,
             batch_size, n_train, n_episode):
    # Gym environment
    self.env = env
    env_flattened = gym.wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

    # Get space sizes
    self.state_dim = env_flattened.observation_space.shape[0]
    # self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]

    # Replay buffer and a function to sample a batch from it
    self.replay_buffer = replay_buffer
    self.sample_batch = sample_batch

    self.sess = tf.InteractiveSession()

    # Hyperparameters
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_train = n_train
    self.n_episode = n_episode

    # Initialize networks
    self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    # self.state_dim = env.observation_space.shape[0]
    # self.action_dim = env.action_space.shape[0]
    self.state_dim = env.state_size
    self.action_dim = env.action_size
    self.action_bound = (env.action_high - env.action_low) / 2
    print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
          'action_bound: ', self.action_bound)

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                      self.action_dim, self.action_bound)
    self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                        self.action_dim, self.action_bound)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
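# action_bound above is half the action range. With a tanh-squashed actor, the
# usual role of such a bound is to scale the network output into the
# environment's action range; a minimal sketch under that assumption:
import numpy as np

def scale_action(raw_action, action_low, action_high):
    # Map a tanh output in [-1, 1] into [action_low, action_high].
    mid = (action_high + action_low) / 2.0
    bound = (action_high - action_low) / 2.0
    return mid + bound * np.clip(raw_action, -1.0, 1.0)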
def __init__(self, shape_in, num_output, accele_range, angle_range):
    self.input_shape = shape_in
    self.out_shape = num_output
    self.learning_rate_a = LEARNING_RATE_ACTOR
    self.learning_rate_c = LEARNING_RATE_CRITIC
    self.memory = deque(maxlen=MAX_MEMORY_LEN)
    self.train_start = 200
    self.batch_size = 64
    self.gamma = 0.9
    self.sigma_fixed = 2
    self.channel = CHANNEL
    self.critic_input_action_shape = 1
    self.angle_range = angle_range
    self.accele_range = accele_range

    self.actor_model = self.actor_net_builder()
    self.critic_model = self.critic_net_build()
    self.actor_target_model = self.actor_net_builder()
    self.critic_target_model = self.critic_net_build()
    self.OUnoise = OUNoise(2)
    # self.actor_target_model.trainable = False
    # self.critic_target_model.trainable = False

    self.actor_history = []
    self.critic_history = []
    self.reward_history = []
    self.weight_hard_update()
def reset(self):
    # Called at the start of each episode.
    # Initializes the variables used.
    # To move the robot back to the origin each episode, uncomment the lines below.
    self.robot_tf = Transformation()
    self.joint1_tf = Transformation()
    self.link1_tf = Transformation(translation=(self.link1_len, 0))
    self.joint2_tf = Transformation()
    self.link2_tf = Transformation(translation=(self.link2_len, 0))
    self.link1_tf_global = self.robot_tf * self.joint1_tf * self.link1_tf
    self.link2_tf_global = self.link1_tf_global * self.joint2_tf * self.link2_tf
    self.step_count = 0.00

    # Create the target point
    self.target_tf = Transformation(translation=(
        random.randrange(-self.env_boundary, self.env_boundary),
        random.randrange(-self.env_boundary, self.env_boundary)))

    self.ou = OUNoise(dt=self.dt, theta=0.1, sigma=0.2)
    self.done = False
    self.t = 0
    self.buffer = []  # Buffer for visualization; cleared whenever the episode resets.

    # Unlike step(), reset() returns only the initial state.
    return self._get_state()
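# Unlike the earlier sketch, the OUNoise used here takes an explicit time step
# dt. A sketch of the time-discretized OU update that signature suggests (the
# actual class is not shown in the source):
import numpy as np

class OUNoiseDt:
    """Hypothetical dt-parameterized Ornstein-Uhlenbeck process."""

    def __init__(self, size=1, mu=0.0, theta=0.1, sigma=0.2, dt=0.01):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.full(size, mu, dtype=float)

    def sample(self):
        # x += theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x += (self.theta * (self.mu - self.x) * self.dt
                   + self.sigma * np.sqrt(self.dt)
                   * np.random.standard_normal(self.x.shape))
        return self.x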
def __init__(self, env, time_steps, hidden_dim):
    self.name = 'DDPG'  # name for uploading results
    self.scale = env.asset
    self.unit = env.unit
    self.seed = env.rd_seed

    self.time_dim = time_steps
    self.state_dim = env.observation_space.shape[1]
    self.action_dim = env.action_space.shape[0]
    self.batch_size = 64
    self.memory_size = self.time_dim + self.batch_size * 10
    self.start_size = self.time_dim + self.batch_size * 2

    # Initialise actor & critic networks
    self.actor_network = Actor(self.time_dim, self.state_dim,
                               self.action_dim, hidden_dim)
    self.critic_network = Critic(self.time_dim, self.state_dim,
                                 self.action_dim, hidden_dim)

    # Initialize replay buffer
    self.replay_state = torch.zeros(
        (self.start_size - 1, 3, self.state_dim), device=cuda)
    self.replay_next_state = torch.zeros(
        (self.start_size - 1, 3, self.state_dim), device=cuda)
    self.replay_action = torch.zeros(
        (self.start_size - 1, 1, self.state_dim), device=cuda)
    self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)
    self.initial()
def __init__(self, env, DIRECTORY):
    self.batch_size = BATCH_SIZE
    self.replay_start_size = REPLAY_START_SIZE
    # self.sub_batch_size = BATCH_SIZE / n_gpu
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    self.trace_length = TRACE_LENGTH
    self.temp_abstract = TEMP_ABSTRACT
    self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                      self.action_dim, self.temp_abstract, DIRECTORY)
    self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                        self.action_dim, self.temp_abstract, DIRECTORY)

    # initialize replay buffer
    max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                      max_len_trajectory, self.actor_network.last_epi)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    ###
    self.diff = 0.
    self.discounting_mat_dict = {}
def __init__(self, env):
    self.action_dim = env.action_space.shape[0]
    self.state_dim = env.observation_space.shape[0]

    self.h1_dim = 400
    self.h2_dim = 300

    self.actor_learning_rate = 1e-4
    self.critic_learning_rate = 1e-3
    self.gamma = 0.99

    # Ornstein-Uhlenbeck noise parameters
    self.noise_theta = 0.15
    self.noise_sigma = 0.20
    self.ou = OUNoise(self.action_dim, theta=self.noise_theta, sigma=self.noise_sigma)

    self.replay_buffer_size = 1000000
    self.replay_buffer = deque(maxlen=self.replay_buffer_size)
    self.replay_start_size = 1000
    self.batch_size = 64

    self.target_update_rate = 0.001
    self.total_parameters = 0
    self.global_steps = 0
    self.reg_param = 0.01
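# This agent stores transitions in a plain deque rather than a ReplayBuffer
# class. A sketch of the matching uniform-sampling step (the function name and
# tuple layout are assumptions):
import random

def sample_minibatch(replay_buffer, batch_size):
    # Uniformly sample (state, action, reward, next_state, done) tuples.
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = map(list, zip(*batch))
    return states, actions, rewards, next_states, dones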
def __init__(self, state_size, action_size, random_seed, hyperparams):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.hyperparams = hyperparams

    self.actor = Actor(state_size, action_size, random_seed).to(device)
    self.actor_noise = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=hyperparams.alpha_actor)

    self.critic = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optim = optim.Adam(
        self.critic.parameters(),
        lr=hyperparams.alpha_critic,
        weight_decay=hyperparams.weight_decay,
    )

    self.replay_buffer = ReplayBuffer(hyperparams.buffer_size,
                                      hyperparams.batch_size, random_seed)
    self.noise = OUNoise(
        action_size,
        random_seed,
        self.hyperparams.mu,
        self.hyperparams.theta,
        self.hyperparams.sigma,
    )
def __init__(self, sess, number, model_path, global_episodes, explore,
             decay, training):
    self.name = 'worker_' + str(number)  # name for uploading results
    self.number = number
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = 41
    self.action_dim = 18
    self.model_path = model_path
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.sess = sess
    self.explore = explore
    self.decay = decay
    self.training = training

    self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                      self.action_dim, self.name + '/actor')
    self.actor_network.update_target(self.sess)
    self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                        self.action_dim, self.name + '/critic')
    self.critic_network.update_target(self.sess)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.update_local_ops_actor = update_target_graph('global/actor',
                                                      self.name + '/actor')
    self.update_local_ops_critic = update_target_graph('global/critic',
                                                       self.name + '/critic')
def __init__(self, env, state_dim=None):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # along with their target networks
    if state_dim:
        self.state_dim = state_dim
        print(self.state_dim)
    else:
        self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    # Flag to signal save
    self.not_saved = True

    # For normalisation
    self.state_mean = 0
    self.state_std = 1
    self.target_mean = 0
    self.target_std = 1
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
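# Keras-style agents like the one above pair tau with a soft-update step for
# the target models. A minimal sketch, assuming the actor/critic wrappers
# expose a .model attribute as in the set_weights calls above:
def soft_update(local_model, target_model, tau):
    # Blend each weight tensor: new_target = tau * local + (1 - tau) * target.
    local_weights = local_model.model.get_weights()
    target_weights = target_model.model.get_weights()
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.model.set_weights(new_weights)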
def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau,
             actor_lr, critic_lr, gamma, buffer_size, item_space, summary_dir):
    self.state_item_num = state_item_num
    self.action_item_num = action_item_num
    self.emb_dim = emb_dim
    self.batch_size = batch_size
    self.tau = tau
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.item_space = item_space
    self.summary_dir = summary_dir

    self.sess = tf.Session()

    self.s_dim = emb_dim * state_item_num
    self.a_dim = emb_dim * action_item_num
    self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim,
                       batch_size, tau, actor_lr)
    self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                         self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
    self.exploration_noise = OUNoise(self.a_dim)

    # set up summary operators
    self.summary_ops, self.summary_vars = self.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

    # initialize target network weights
    self.actor.hard_update_target_network()
    self.critic.hard_update_target_network()

    # initialize replay memory
    self.replay_buffer = ReplayBuffer(buffer_size)
def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # actor
    self.actor = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

    # critic
    self.critic = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4,
                                 weight_decay=0.0001)

    # noise process added to actions for exploration
    self.noise = OUNoise(action_size, seed)

    # experience replay
    self.replay = ReplayBuffer(seed)
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    # self.state_dim = env.observation_space.shape[0] * 2
    self.action_dim = env.action_space.shape[0]
    self.time_step = 0

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    self.exploration_noise = OUNoise()

    # loading networks
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        my_config.logger.warn("Successfully loaded: %s"
                              % checkpoint.model_checkpoint_path)
    else:
        my_config.logger.error("Could not find old network weights")
def main():
    experiment = 'model-builder-v0'  # specify environment here
    env = gym.make(experiment)
    # steps = env.spec.timestep_limit  # steps per episode
    steps = 20
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network
    # and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            # Select action according to current policy and exploration noise
            action = action[0] + noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t,
                      ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                # reinitializing random noise for action exploration
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
def learn(self, total_timesteps, callback):
    ou_scale = 1.0     # initial scaling factor; this slowly decreases to 0
    ou_decay = 0.9995  # decay of the scaling factor ou_scale
    ou_mu = 0.0        # asymptotic mean of the noise
    ou_theta = 0.15    # magnitude of the drift term
    ou_sigma = 0.20    # magnitude of the diffusion term

    # create the noise process
    noise_process = OUNoise(self.action_size, ou_mu, ou_theta, ou_sigma)

    # create the replay buffer
    buffer = ReplayBuffer(seed=self.seed,
                          action_size=self.action_size,
                          buffer_size=self.buffer_size,
                          batch_size=self.batch_size,
                          device=self.device)

    self.t_step = 0
    episode = 0
    while self.t_step < total_timesteps:
        callback.on_start_episode(episode)
        episode_scores = np.zeros(self.env.num_agents)
        states, _, _ = self.env.reset()
        while True:
            # reshape so we can feed both agents' states to each agent
            states = np.reshape(states, (1, 48))
            # split the states into the parts observed by each agent
            states_0 = states[0, :24].reshape((1, 24))
            states_1 = states[0, 24:].reshape((1, 24))

            # generate noise and split it into the parts for each agent
            noise = ou_scale * noise_process.get_noise().reshape((1, 4))
            noise_0 = noise[0, :2].reshape((1, 2))
            noise_1 = noise[0, 2:].reshape((1, 2))

            # determine actions for the unity agents from the current state,
            # using noise for exploration
            actions_0 = self.player_policy.act(
                states_0, use_target=False, add_noise=True,
                noise_value=noise_0).detach().cpu().numpy()
            actions_1 = self.opponent_policy.act(
                states_1, use_target=False, add_noise=True,
                noise_value=noise_1).detach().cpu().numpy()
            actions = np.vstack((actions_0.flatten(), actions_1.flatten()))

            # take the action in the environment
            next_states, rewards, dones, info = self.env.step(actions)

            # store (S, A, R, S') info in the replay buffer (memory)
            buffer.add(states.flatten(), actions.flatten(), rewards,
                       next_states.flatten(), dones)

            episode_scores += rewards
            states = next_states
            self.t_step += 1

            # Policy learning: train the agents once the buffer holds enough replays
            if len(buffer) >= self.batch_size:
                self.player_policy.learn(buffer.sample(), self.opponent_policy)
                self.opponent_policy.learn(buffer.sample(), self.player_policy)

            if np.any(dones):
                break

        if not callback.on_step(np.max(episode_scores), self.t_step):
            break

        # decrease the scaling factor of the noise
        ou_scale *= ou_decay
        episode += 1
def __init__(self, model, env, sess, num_episodes, direction):
    self.model = model
    self.sess = sess
    self.direction = direction
    self.env = env
    self.num_episodes = num_episodes
    self.episode_start = 0
    self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
    self.noise_decay = 0.2
    self.epsilon = EPSILON
    self.epsilon_decay = nth_root(self.num_episodes, 0.001 / self.epsilon)
    self.count_exp_replay = 0
    self.tau = TAU

    self.target_Q_ph = tf.placeholder(tf.float32, shape=(None, 1))
    self.actions_grads_ph = tf.placeholder(
        tf.float32, shape=((None,) + self.env.action_space.shape))

    # train operations
    self.actor_train_ops = self.model.Actor.train_step(self.actions_grads_ph)
    self.critic_train_ops = self.model.Critic.train_step(self.target_Q_ph)

    # update operations
    self.update_critic_target = self.model.update_target_network(
        self.model.Critic.network_params,
        self.model.Critic_target.network_params, self.tau)
    self.update_actor_target = self.model.update_target_network(
        self.model.Actor.network_params,
        self.model.Actor_target.network_params, self.tau)

    sess.run(tf.global_variables_initializer())

    # for testing only: hard-copy the online weights into the targets
    self.sess.run(self.model.update_target_network(
        self.model.Critic.network_params,
        self.model.Critic_target.network_params))
    self.sess.run(self.model.update_target_network(
        self.model.Actor.network_params,
        self.model.Actor_target.network_params))

    # reward summary for tensorboard
    self.tf_reward = tf.Variable(0.0, trainable=False, name='reward_summary')
    self.tf_reward_summary = tf.summary.scalar("Reward by episode", self.tf_reward)

    # time per episode
    self.tf_time = tf.Variable(0.0, trainable=False, name='Time_per_episode')
    self.tf_time_summary = tf.summary.scalar("Time per episode", self.tf_time)

    # writer
    self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)
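# nth_root above is presumably a helper computing the per-episode decay factor
# f such that epsilon * f**num_episodes reaches the floor of 0.001. A sketch of
# that assumption:
def nth_root(n, x):
    # n-th root of x: the factor f with f**n == x.
    return x ** (1.0 / n)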
def __init__(self, env, state_dim, action_dim):
    self.name = 'DDPG'
    self.environment = env
    self.time_step = 0
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize one Ornstein-Uhlenbeck process per action component
    # (linear and angular) for action exploration
    self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
    self.angular_noise = OUNoise(1, 0, 0.6, 0.8)
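# This agent keeps separate OU processes for the linear and angular velocity
# components. A hedged sketch of how an action step might combine them (the
# actor API, the noise() method, and the clip ranges are all assumptions):
import numpy as np

def noisy_action(agent, state):
    action = agent.actor_network.action(state)  # assumed actor API
    action[0] = np.clip(action[0] + agent.linear_noise.noise()[0], 0.0, 1.0)
    action[1] = np.clip(action[1] + agent.angular_noise.noise()[0], -1.0, 1.0)
    return action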
def main():
    # Randomly initialize critic, actor, target critic, target actor network
    # and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE, CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            # Select action according to current policy and exploration noise
            action = action[0] + noise
            print("Action at step", t, " :", action, "\n")

            # observation, reward, done, info = env.step(action)
            observation, reward, done, info = ca_step(action)
            print(x, observation, action, reward, done)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t,
                      ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                # reinitializing random noise for action exploration
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
def __init__(self, env, state_size, action_size):
    self.env = env
    self.replay_memory = deque()
    self.actor_network = actor_network.ActorNetwork(state_size, action_size)
    self.critic_network = critic_network.CriticNetwork(state_size, action_size)
    self.ou_noise = OUNoise(action_size)
    self.time_step = 0
def __init__(self, env, args):
    self.direction = args.direction
    self.env = env
    self.num_episodes = args.episodes
    self.episode_start = 0
    self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
    self.noise_decay = args.noise_decay
    self.count_exp_replay = 0
    self.train_iteration = 0
    self.tau = args.TAU
    self.tools = Tools()
def __init__(self, task, train=True):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Learning rates suggested by this paper:
    # https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
    self.actor_learning_rate = 0.001
    self.actor_decay = 0.0
    self.critic_learning_rate = 0.001
    self.critic_decay = 0.0

    # Actor Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             self.actor_learning_rate, self.actor_decay)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              self.actor_learning_rate, self.actor_decay)

    # Critic Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               self.critic_learning_rate, self.critic_decay)
    self.critic_target = Critic(self.state_size, self.action_size,
                                self.critic_learning_rate, self.critic_decay)

    # initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    # self.exploration_theta = 0.15
    # self.exploration_sigma = 0.2
    self.exploration_theta = 0.01
    self.exploration_sigma = 0.02
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    self.best_w = None
    self.best_score = -np.inf
    # self.noise_scale = 0.7
    self.score = 0

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters

    # Indicate whether we want to learn (or only predict without learning)
    self.set_train(train)
def main():
    env = Env(19997)
    steps = 10000
    num_states = 59
    num_actions = 3

    # Randomly initialize critic, actor, target critic, target actor network
    # and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')

    for i in range(episodes):
        # print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        done = False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            # Select action according to current policy and exploration noise,
            # then clip each component to the valid range [-1, 1]
            action = action[0] + noise
            action = np.clip(action, -1.0, 1.0)

            observation, reward, done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x, observation, action, reward, done)

            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print('Episode', i, 'Steps: ', t,
                      'Episode Reward:', reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
        total_reward += reward_per_episode
def __init__(self, sess, data_fname, replay=False):
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = Hp.state_dim
    self.action_dim = Hp.action_dim
    print(self.state_dim, self.action_dim)
    self.sess = sess

    self.state_input = [
        tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
        for _ in range(Hp.categories)
    ]  # tf.placeholder("float", [None, self.state_dim])
    self.target_state_input = [
        tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
        for _ in range(Hp.categories)
    ]  # tf.placeholder("float", [None, self.state_dim])

    self.state_network = StateEnc(self.sess, self.state_input,
                                  self.target_state_input)
    state_batch = self.state_network.encoding
    next_state_batch = self.state_network.target_encoding

    (weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1,
     w_i2h2, w_h2h2, w_b2) = self.state_network.get_parameters()
    state_network_params = weights + biases + [
        w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
    ]

    self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim,
                                      self.state_input, state_batch,
                                      next_state_batch, state_network_params)
    self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim,
                                        state_batch, next_state_batch)

    # initialize replay buffer
    if replay:
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
    self.summary_str2 = None

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env):
    self.sess = tf.InteractiveSession()
    # self.params = loadparams() # ???
    self.env = env
    self.n_states = env.observation_space.shape[0]
    self.n_actions = env.action_space.shape[0]
    self.low = self.env.action_space.low
    self.high = self.env.action_space.high

    self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
    self.trainable_var_count = self.actor_network.get_trainable_var_count()
    self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions,
                                        self.actor_network, self.trainable_var_count)

    self.replay_buffer = ReplayBuffer(BUFFER_SIZE)  # params['buffer_size']???
    self.exploration_noise = OUNoise(self.n_actions)
    # self.noise = Noise()
    self.gamma = GAMMA
    self.sess.run(tf.global_variables_initializer())
def __init__(self, env, device):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    self.device = device
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.actor_network = ActorNetwork(self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, state_space, action_dim):
    self.name = 'DDPG'  # name for uploading results
    self.sess = tf.Session()

    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_space = state_space
    self.action_dim = action_dim  # 1

    self.ac_network = ActorCriticNetwork(self.sess, self.state_space, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)