class DDPG: """docstring for DDPG""" def __init__(self, sess, env, par_idx): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.par_idx = par_idx self.sess = sess with tf.variable_scope("particle_" + str(par_idx)): self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.par_idx) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) #self.actor_network.train(q_gradient_batch,state_batch) self.actor_network.save_gradient(q_gradient_batch, state_batch) def update_target(self): # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def save_to_buffer(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) if done: self.exploration_noise.reset() def can_train(self): if self.replay_buffer.count() > REPLAY_START_SIZE: return True else: return False
class DDPG: """docstring for DDPG""" def __init__(self, sess, data_fname): self.name = 'DDPG' # Randomly initialize actor network and critic network # with both their target networks self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = Hp.state_dim self.action_dim = Hp.action_dim print(self.state_dim, self.action_dim) self.sess = sess self.state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.target_state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.state_network = StateEnc(self.sess, self.state_input, self.target_state_input) state_batch = self.state_network.encoding next_state_batch = self.state_network.target_encoding weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters( ) state_network_params = weights + biases + [ w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 ] self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim, self.state_input, state_batch, next_state_batch, state_network_params) self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim, state_batch, next_state_batch) # initialize replay buffer self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname) self.summary_str2 = None # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN) print("######### TRAINING #############") for k in range(Hp.N_TRAIN): minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size] state_batch_r = np.asarray([data[0] for data in minibatch]) state_batch = [] for j in range(Hp.categories): new_cat = np.stack(state_batch_r[:, j], axis=0) state_batch.append(new_cat) #state_batch = [np.expand_dims(state_batch, axis=1)] action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch_r = np.asarray([data[3] for data in minibatch]) next_state_batch = [] for j in range(Hp.categories): new_cat = np.stack(next_state_batch_r[:, j], axis=0) next_state_batch.append(new_cat) #next_state_batch = [np.expand_dims(next_state_batch, axis=1)] done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [Hp.batch_size, self.action_dim]) next_action_batch = self.actor_network.target_actions( self.target_state_input, next_state_batch) q_value_batch = self.critic_network.target_q( self.target_state_input, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [Hp.batch_size, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, self.state_input, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.state_input, state_batch) q_gradient_batch = self.critic_network.gradients( self.state_input, state_batch, 
action_batch_for_gradients) self.summary_str2 = self.actor_network.train( q_gradient_batch, self.state_input, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() self.state_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) print("no noise ", action) return np.clip( action + self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]), [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0]) def action(self, state): state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > Hp.REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, state_dim, action_dim, env): self.name = 'DDPG' # name for uploading results self.time_step = 0 self.state_dim = state_dim self.action_dim = action_dim self.environment = env self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() == REPLAY_START_SIZE: print('\n---------------Start training---------------') # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % 10000 == 0 and self.time_step > 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() return self.time_step
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

            if self.time_step % 10000 == 0 and self.time_step > 0:
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)

        return self.time_step
# --- DDPG variant: epsilon-scheduled exploration over an action_spec()-style environment ---
class DDPG:
    """Deep Deterministic Policy Gradient agent with epsilon-scheduled exploration."""

    def __init__(self, env):
        self.name = 'DDPG'
        self.environment = env
        self.episode = 0
        self.epsilon = 0.98
        self.one_number = 1
        self.mean = []
        # obs2state is an external helper from this variant's repository
        self.state_dim = len(obs2state(env.reset().observation))
        self.action_dim = env.action_spec().shape[0]

        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled policy gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Exploration here returns a pure Ornstein-Uhlenbeck noise sample;
        # the policy's own action is computed but not added to it.
        action = self.actor_network.action(state)
        exp = self.exploration_noise.noise()
        return exp

    def action(self, state):
        # With probability epsilon take the exploration sample, otherwise the policy
        # action; either way the result is squashed with tanh.
        if np.random.rand() <= self.epsilon:
            z = np.array(self.noise_action(state))
        else:
            z = np.array(self.actor_network.action(state))
        self.mean.append(z[0])
        return np.tanh(z)

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        if self.epsilon > 0.1:
            self.epsilon *= 0.99999
        if done:
            self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, state_dim, action_dim): """name for uploading resuults""" self.name = 'DDPG' self.time_step = 0 # self.atten_rate = 1 """Randomly initialize actor network and critic network""" """and both their target networks""" self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) """initialize replay buffer""" self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) """Initialize a random process the Ornstein-Uhlenbeck process for action exploration""" self.exploration_noise = OUNoise(self.action_dim) """Initialize a Treading""" self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') def train(self): # if self.time_step ==0: # print("Begins Training!!!") #print("Training Begins") self.time_step += 1 """Sample a random minibatch of N transitions from replay buffer""" """take out BATCH_SIZE sets of data""" minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) """resize the action_batch shape to [BATCH_SIZE, self.action_dim]""" action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) """Calculate y_batch(reward)""" next_action_batch = self.actor_network.target_action(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) """Update critic by minimizing the loss L (training)""" self.critic_network.train(y_batch, state_batch, action_batch) """Update the actor policy using the sampled gradient:""" action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) """Update the target networks""" self.actor_network.update_target() self.critic_network.update_target() #print("Training Finished") def noise_action(self, state): """Select action a_t according to the current policy and exploration noise""" action = self.actor_network.action(state) exp_noise = self.exploration_noise.noise() action += exp_noise # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def action(self, state): action = self.actor_network.action(state) # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def perceive(self, state, action, reward, next_state, done): """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer""" self.replay_buffer.add(state, action, reward, next_state, done) """Store transitions to replay start size then start training""" # if self.replay_buffer.count() % 1000 == 0: # print("The buffer count is ", self.replay_buffer.count()) if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # self.atten_rate *= 0.99995 if not self.threading.is_alive(): self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') self.threading.start() """SAVE NETWORK""" if 
self.time_step % 100 == 0: print("Training_time_step:", self.time_step) if self.time_step % 1000 == 0: print("!!!!!!!save model success!!!!!!!!") self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) """Re-iniitialize the random process when an episode ends""" if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, a_dim, s_dim): self.name = 'DDPG' # name for uploading results # self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = s_dim self.action_dim = a_dim self.time_step=0 self.max_bw = 0.0 self.max_cwnd = 0.0 self.min_rtt = 9999999.0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def learn(self): # print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): self.time_step += 1 # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) noise = self.exploration_noise.noise() # print("noise:" + str(noise)) return action + noise def choose_action(self, state): self.time_step += 1 # print("_______________________choose_action_____________________") action = self.actor_network.action(state) return action def store_transition(self, s, a, r, s_,done,episode_count): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer # print("*********************************ADD****************************") self.replay_buffer.add(s, a, r, s_, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: if((episode_count+1)%100!= 0): self.learn() # print("learn!") else: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def extract_observation(self,dataRecorder,subflow_index,state_before): # print("extracting...") value_dic = dataRecorder.get_latest_data() state_after=state_before.reshape(10,5) # observation = np.zeros((4)) observation = np.zeros((5)) t_cWnd=[0,0] t_thr=[0,0] t_rtt=[0,0] t_loss_rate=[0,0] t_unAck=[0,0] s0=[0,0,0,0,0] 
state=np.zeros(1) for i in range(value_dic["nbOfSubflows"]): name = "cWnd" + str(i) t_cWnd[i] = value_dic[name] name = "rtt"+str(i) t_rtt[i] = value_dic[name] name = "unAck" + str(i) t_unAck[i]=value_dic[name] name = "loss_rate" + str(i) t_loss_rate[i]=value_dic[name] name = "throughput" + str(i) t_thr[i]=value_dic[name] thr=t_thr[subflow_index] s0[0]=t_thr[subflow_index] rtt=t_rtt[subflow_index] s0[1]=t_rtt[subflow_index] cwnd=t_cWnd[subflow_index] s0[2]=t_cWnd[subflow_index] loss_rate=t_loss_rate[subflow_index] s0[3]=t_loss_rate[subflow_index] unAck=t_unAck[subflow_index] s0[4]=t_unAck[subflow_index] s0=np.array(s0) min_=s0-s0 thr_n=s0[0] thr_n_min=s0[0]-min_[0] rtt_min=s0[1]-min_[1] cwnd_n_min=s0[2]-min_[2] loss_rate_n_min=s0[3]-min_[3] unAck_n_min=s0[4]-min_[4] # loss_rate_n_min=s0[7]-min_[7] if self.max_bw<thr_n_min: self.max_bw=thr_n_min if self.max_cwnd<cwnd_n_min: self.max_cwnd=cwnd_n_min if self.max_cwnd<cwnd_n_min: self.max_cwnd=cwnd_n_min if self.min_rtt>rtt_min: self.min_rtt=rtt_min reward = thr_n_min-5*(rtt_min-self.min_rtt)-10*loss_rate_n_min print("reward:"+str(reward)+" thr_n_min:"+str(thr_n_min)+ " rtt_min:"+str(rtt_min)+" self.min_rtt :"+str(self.min_rtt)+" delta_rtt"+str(rtt_min-self.min_rtt)) # print("unAck:"+str(unAck_n_min)) if self.max_bw!=0: state[0]=thr_n_min/self.max_bw # tmp=pacing_rate_n_min/self.max_bw state=np.append(state,[5*loss_rate_n_min]) state=np.append(state,[unAck_n_min]) else: state[0]=0 state=np.append(state,[0]) state=np.append(state,[0]) state=np.append(state,[1400/cwnd]) state=np.append(state,[self.min_rtt/rtt_min]) state_after=np.delete(state_after,[0],axis = 0) state_after=np.append(state_after,state) return state_after,reward,thr_n_min,rtt_min
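# A minimal, hypothetical driver sketch for the congestion-control variant above. The data
# recorder object, subflow index, episode/step counts and the initial 50-element state window
# are assumptions inferred from extract_observation(); how the chosen action is applied to
# the subflow's congestion window is outside this snippet.
def control_loop_sketch(agent, data_recorder, subflow_index, episodes=100, steps=200):
    state = np.zeros(10 * 5)                      # flat sliding window of the last 10 observations
    for episode_count in range(episodes):
        for step in range(steps):
            action = agent.noise_action(state)    # exploratory action from the current policy
            # ... apply `action` to the sender here (not shown) ...
            next_state, reward, thr, rtt = agent.extract_observation(
                data_recorder, subflow_index, state)
            done = (step == steps - 1)
            agent.store_transition(state, action, reward, next_state, done, episode_count)
            state = next_state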