Example #1
def main():
    # Assumes the surrounding module imports gym, rospy, and tensorflow as tf,
    # pulls register from gym.envs.registration, and defines ObstacleEnv,
    # ActorNetwork, and EPISODES elsewhere in the project.
    register(
        id='Obstacle-v0',
        entry_point=ObstacleEnv,
        max_episode_steps=1000,
        reward_threshold=100.0,
    )
    env = gym.make('Obstacle-v0')

    sess = tf.InteractiveSession()
    agent = ActorNetwork(sess, 14, 2)  # 14-dimensional state, 2-dimensional action

    rate = rospy.Rate(10.0)  # run the control loop at 10 Hz

    for episode in range(EPISODES):
        state = env.reset()
        while True:
            action = agent.action(state[0])
            next_state, reward, done, info = env.step(action)
            state = next_state
            if done:
                break
            rate.sleep()
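The snippet above registers and drives a custom ObstacleEnv that is not shown. As a point of reference, the sketch below outlines what such an entry point could look like under the classic Gym API used here (step returning four values); the observation/action sizes follow ActorNetwork(sess, 14, 2), while the spaces, bounds, and internals are purely hypothetical.

import gym
import numpy as np
from gym import spaces


class ObstacleEnv(gym.Env):
    """Hypothetical stand-in for the ObstacleEnv registered above."""

    def __init__(self):
        # 14-dimensional observation and 2-dimensional action, matching the example;
        # the bounds here are made up for illustration only.
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(14,), dtype=np.float32)
        self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)

    def reset(self):
        # The loop above indexes state[0], so the environment apparently returns a
        # sequence whose first element is the observation vector.
        return [np.zeros(14, dtype=np.float32)]

    def step(self, action):
        next_obs = [np.zeros(14, dtype=np.float32)]
        reward, done, info = 0.0, False, {}
        return next_obs, reward, done, info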
Example #2
class DDPG:
    """docstring for DDPG"""
    def __init__(self, sess, env, par_idx):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.par_idx = par_idx
        self.sess = sess

        with tf.variable_scope("particle_" + str(par_idx)):
            self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                              self.action_dim, self.par_idx)
            self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                                self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Reshape actions to [BATCH_SIZE, action_dim] (handles the action_dim = 1 case)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        #self.actor_network.train(q_gradient_batch,state_batch)
        self.actor_network.save_gradient(q_gradient_batch, state_batch)

    def update_target(self):
        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the replay buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def save_to_buffer(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        if done:
            self.exploration_noise.reset()

    def can_train(self):
        return self.replay_buffer.count() > REPLAY_START_SIZE
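Most of the DDPG variants in this collection construct an OUNoise(action_dim) object for exploration, but its implementation is never shown. The following is a minimal sketch of a typical Ornstein-Uhlenbeck process with the noise()/reset() interface used above; the parameter values (mu, theta, sigma) are common defaults, not values taken from these projects.

import numpy as np


class OUNoise:
    """Sketch of an Ornstein-Uhlenbeck process with the assumed noise()/reset() interface."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean the process decays toward
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # scale of the Gaussian perturbation
        self.state = np.ones(action_dim) * mu

    def reset(self):
        # The examples call this when an episode ends
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, I): temporally correlated exploration noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state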
Example #3
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_dim, action_dim, env):
        self.name = 'DDPG'  # name for uploading results
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.environment = env
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Reshape actions to [BATCH_SIZE, action_dim] (handles the action_dim = 1 case)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Start training once the replay buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return self.time_step
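The ReplayBuffer used by every example (with add, get_batch, and count methods) is also assumed rather than shown. A minimal deque-based sketch matching that interface:

import random
from collections import deque


class ReplayBuffer:
    """Sketch of the replay buffer interface assumed by the examples."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Transitions are stored as (s_t, a_t, r_t, s_{t+1}, done) tuples,
        # which is why train() unpacks data[0] .. data[4].
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Uniform random minibatch (without replacement)
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)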
Example #4
class DDPG:
    """docstring for DDPG"""
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        # One placeholder per input category, each of shape [batch, time, n_coord]
        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in range(Hp.categories)
        ]
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in range(Hp.categories)
        ]
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize the Ornstein-Uhlenbeck random process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample N_TRAIN minibatches' worth of transitions from the replay buffer
        minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN)
        print("######### TRAINING   #############")
        for k in range(Hp.N_TRAIN):
            minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size]
            state_batch_r = np.asarray([data[0] for data in minibatch])
            state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(state_batch_r[:, j], axis=0)
                state_batch.append(new_cat)
            #state_batch = [np.expand_dims(state_batch, axis=1)]
            action_batch = np.asarray([data[1] for data in minibatch])
            reward_batch = np.asarray([data[2] for data in minibatch])
            next_state_batch_r = np.asarray([data[3] for data in minibatch])
            next_state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(next_state_batch_r[:, j], axis=0)
                next_state_batch.append(new_cat)
            #next_state_batch = [np.expand_dims(next_state_batch, axis=1)]
            done_batch = np.asarray([data[4] for data in minibatch])

            # Reshape actions to [Hp.batch_size, action_dim] (handles the action_dim = 1 case)
            action_batch = np.resize(action_batch,
                                     [Hp.batch_size, self.action_dim])

            next_action_batch = self.actor_network.target_actions(
                self.target_state_input, next_state_batch)
            q_value_batch = self.critic_network.target_q(
                self.target_state_input, next_state_batch, next_action_batch)
            y_batch = []

            for i in range(len(minibatch)):
                if done_batch[i]:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] +
                                   Hp.GAMMA * q_value_batch[i])

            y_batch = np.resize(y_batch, [Hp.batch_size, 1])

            # Update critic by minimizing the loss L
            self.critic_network.train(y_batch, self.state_input, state_batch,
                                      action_batch)

            # Update the actor policy using the sampled gradient:
            action_batch_for_gradients = self.actor_network.actions(
                self.state_input, state_batch)
            q_gradient_batch = self.critic_network.gradients(
                self.state_input, state_batch, action_batch_for_gradients)

            self.summary_str2 = self.actor_network.train(
                q_gradient_batch, self.state_input, state_batch)

            # Update the target networks
            self.actor_network.update_target()
            self.critic_network.update_target()
            self.state_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        print("action before noise ", action)
        # Scale the OU noise per action dimension, then clip to the action bounds
        return np.clip(
            action +
            self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]),
            [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0])

    def action(self, state):
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the replay buffer holds more than Hp.REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > Hp.REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #5
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # Reshape actions to [BATCH_SIZE, action_dim] (handles the action_dim = 1 case)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def action(self, state):
        action = self.actor_network.action(state)

        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Start training once the replay buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        return self.time_step
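update_target() on the actor and critic is invoked after each training step but implemented inside the network classes, which are not shown. In DDPG this is conventionally a Polyak (soft) update of the target parameters; below is a TF1-style sketch, assuming a small mixing rate TAU (the value is a common choice, not taken from these projects).

import tensorflow as tf

TAU = 0.001  # assumed soft-update rate


def make_soft_update_op(net_vars, target_vars, tau=TAU):
    # Move each target variable a fraction tau toward its online counterpart:
    # theta_target <- tau * theta + (1 - tau) * theta_target
    updates = [
        tf.assign(target, tau * source + (1.0 - tau) * target)
        for source, target in zip(net_vars, target_vars)
    ]
    return tf.group(*updates)

# Typical wiring inside a network class (hypothetical attribute names):
#   self.update_target_op = make_soft_update_op(self.net_params, self.target_net_params)
#   def update_target(self):
#       self.sess.run(self.update_target_op)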
Example #6
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'
        self.environment = env
        self.episode = 0
        self.epsilon = 0.98
        self.one_number = 1
        self.mean = []
        self.state_dim = len(obs2state(env.reset().observation))
        self.action_dim = env.action_spec().shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        self.critic_network.train(y_batch, state_batch, action_batch)
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # During exploration this variant returns pure OU noise; the actor's
        # action is computed but not used here
        action = self.actor_network.action(state)
        exp = self.exploration_noise.noise()
        return exp

    def action(self, state):
        # Epsilon-greedy switch between the noisy exploration action and the actor's action
        if np.random.rand() <= self.epsilon:
            act = self.noise_action(state)
            z = array(act)
        else:
            action = self.actor_network.action(state)
            z = array(action)
        self.mean.append(z[0])  # track the first action component for logging
        g = np.tanh(z)  # squash the action into [-1, 1]
        return g

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        if self.epsilon > 0.1:
            self.epsilon *= 0.99999

        if done:
            self.exploration_noise.reset()
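The per-sample loop that builds y_batch in these train() methods computes the TD target y_i = r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1})), dropping the bootstrap term on terminal transitions. The same computation can be written in vectorized NumPy; a sketch, assuming q_value_batch can be reshaped to [BATCH_SIZE, 1]:

import numpy as np


def td_targets(reward_batch, q_value_batch, done_batch, gamma):
    # y = r + gamma * Q_target(s', mu_target(s')) for non-terminal transitions, y = r otherwise
    rewards = np.asarray(reward_batch, dtype=np.float32).reshape(-1, 1)
    q_next = np.asarray(q_value_batch, dtype=np.float32).reshape(-1, 1)
    not_done = 1.0 - np.asarray(done_batch, dtype=np.float32).reshape(-1, 1)
    return rewards + gamma * not_done * q_next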
Example #7
File: ddpg.py  Project: ChampionZP/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Reshape actions to [BATCH_SIZE, action_dim] (handles the action_dim = 1 case)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the replay buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
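In train(), critic_network.gradients(...) returns the gradient of Q with respect to the actions, and actor_network.train(q_gradient_batch, state_batch) uses it to update the policy by the chain rule (the deterministic policy gradient). Below is a TF1-style sketch of how that actor update is conventionally wired; the function name and learning rate are assumptions, not code from this project.

import tensorflow as tf


def build_actor_update(actor_output, actor_params, q_gradient_input, learning_rate=1e-4):
    # dJ/dtheta = dQ/da * da/dtheta; feeding -dQ/da and minimizing performs ascent on Q
    parameter_gradients = tf.gradients(actor_output, actor_params, -q_gradient_input)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.apply_gradients(list(zip(parameter_gradients, actor_params)))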
Example #8
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_dim, action_dim):
        """name for uploading resuults"""
        self.name = 'DDPG'
        self.time_step = 0
        # self.atten_rate = 1
        """Randomly initialize actor network and critic network"""
        """and both their target networks"""
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        """initialize replay buffer"""
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        """Initialize a random process the Ornstein-Uhlenbeck process for action exploration"""
        self.exploration_noise = OUNoise(self.action_dim)
        """Initialize a Treading"""
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')

    def train(self):
        self.time_step += 1
        """Sample a random minibatch of N transitions from replay buffer"""
        """take out BATCH_SIZE sets of data"""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        """resize the action_batch shape to  [BATCH_SIZE, self.action_dim]"""
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        """Calculate y_batch(reward)"""
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        """Update critic by minimizing the loss L (training)"""
        self.critic_network.train(y_batch, state_batch, action_batch)
        """Update the actor policy using the sampled gradient:"""
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)
        """Update the target networks"""
        self.actor_network.update_target()
        self.critic_network.update_target()
        #print("Training Finished")

    def noise_action(self, state):
        """Select action a_t according to the current policy and exploration noise"""
        action = self.actor_network.action(state)
        exp_noise = self.exploration_noise.noise()
        action += exp_noise
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def action(self, state):
        action = self.actor_network.action(state)
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def perceive(self, state, action, reward, next_state, done):
        """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer"""
        self.replay_buffer.add(state, action, reward, next_state, done)
        """Store transitions to replay start size then start training"""
        # if self.replay_buffer.count() % 1000 == 0:
        #     print("The buffer count is ", self.replay_buffer.count())
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            # self.atten_rate *= 0.99995
            if not self.threading.is_alive():
                self.threading = threading.Thread(target=self.train,
                                                  name='LoopThread--DDPG')
                self.threading.start()
            """SAVE NETWORK"""
            if self.time_step % 100 == 0:
                print("Training_time_step:", self.time_step)
            if self.time_step % 1000 == 0:
                print("!!!!!!!save model success!!!!!!!!")
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)
        """Re-iniitialize the random process when an episode ends"""
        if done:
            self.exploration_noise.reset()
Example #9
class DDPG:
    """docstring for DDPG"""


    def __init__(self, a_dim, s_dim):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = s_dim
        self.action_dim = a_dim
        self.time_step = 0
        self.max_bw = 0.0
        self.max_cwnd = 0.0
        self.min_rtt = 9999999.0

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def learn(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Reshape actions to [BATCH_SIZE, action_dim] (handles the action_dim = 1 case)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        self.time_step += 1
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise()
        # print("noise:" + str(noise))
        return action + noise

    def choose_action(self, state):
        self.time_step += 1
        # print("_______________________choose_action_____________________")
        action = self.actor_network.action(state)
        return action

    def store_transition(self, s, a, r, s_, done, episode_count):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(s, a, r, s_, done)

        # Start training once the replay buffer holds more than REPLAY_START_SIZE
        # transitions; every 100th episode, save the networks instead of training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            if (episode_count + 1) % 100 != 0:
                self.learn()
            else:
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)


        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def extract_observation(self, dataRecorder, subflow_index, state_before):
        # Read the latest per-subflow statistics and update the (10, 5) rolling state window
        value_dic = dataRecorder.get_latest_data()
        state_after = state_before.reshape(10, 5)
        observation = np.zeros((5))
        t_cWnd = [0, 0]
        t_thr = [0, 0]
        t_rtt = [0, 0]
        t_loss_rate = [0, 0]
        t_unAck = [0, 0]
        s0 = [0, 0, 0, 0, 0]
        state = np.zeros(1)
        for i in range(value_dic["nbOfSubflows"]):
            name = "cWnd" + str(i)
            t_cWnd[i] = value_dic[name]
            name = "rtt" + str(i)
            t_rtt[i] = value_dic[name]
            name = "unAck" + str(i)
            t_unAck[i] = value_dic[name]
            name = "loss_rate" + str(i)
            t_loss_rate[i] = value_dic[name]
            name = "throughput" + str(i)
            t_thr[i] = value_dic[name]

        thr = t_thr[subflow_index]
        s0[0] = t_thr[subflow_index]

        rtt = t_rtt[subflow_index]
        s0[1] = t_rtt[subflow_index]

        cwnd = t_cWnd[subflow_index]
        s0[2] = t_cWnd[subflow_index]

        loss_rate = t_loss_rate[subflow_index]
        s0[3] = t_loss_rate[subflow_index]

        unAck = t_unAck[subflow_index]
        s0[4] = t_unAck[subflow_index]

        s0 = np.array(s0)
        min_ = s0 - s0  # a baseline of zeros, so each *_min value equals the raw reading

        thr_n = s0[0]
        thr_n_min = s0[0] - min_[0]
        rtt_min = s0[1] - min_[1]
        cwnd_n_min = s0[2] - min_[2]
        loss_rate_n_min = s0[3] - min_[3]
        unAck_n_min = s0[4] - min_[4]

        # Track the running maxima/minima used for normalization
        if self.max_bw < thr_n_min:
            self.max_bw = thr_n_min
        if self.max_cwnd < cwnd_n_min:
            self.max_cwnd = cwnd_n_min
        if self.min_rtt > rtt_min:
            self.min_rtt = rtt_min

        
        # Reward: higher throughput, lower RTT inflation, and lower loss rate are better
        reward = thr_n_min - 5 * (rtt_min - self.min_rtt) - 10 * loss_rate_n_min
        print("reward:" + str(reward) + " thr_n_min:" + str(thr_n_min) +
              " rtt_min:" + str(rtt_min) + " self.min_rtt:" + str(self.min_rtt) +
              " delta_rtt:" + str(rtt_min - self.min_rtt))
        if self.max_bw != 0:
            state[0] = thr_n_min / self.max_bw
            state = np.append(state, [5 * loss_rate_n_min])
            state = np.append(state, [unAck_n_min])
        else:
            state[0] = 0
            state = np.append(state, [0])
            state = np.append(state, [0])
        state = np.append(state, [1400 / cwnd])
        state = np.append(state, [self.min_rtt / rtt_min])

        # Slide the window: drop the oldest row and append the newest 5-feature state
        state_after = np.delete(state_after, [0], axis=0)
        state_after = np.append(state_after, state)

        return state_after, reward, thr_n_min, rtt_min
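extract_observation maintains a rolling (10, 5) window of the five per-subflow features: the oldest row is dropped and the newest five-element state is appended. Because np.append on a 2-D array flattens its result, the window comes back as a length-50 vector, which the next call reshapes to (10, 5) again. A small self-contained illustration of that update (the feature values are made up):

import numpy as np

state_before = np.zeros(50)                         # flattened history: 10 steps x 5 features
new_features = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # hypothetical newest observation

window = state_before.reshape(10, 5)
window = np.delete(window, [0], axis=0)             # drop the oldest row -> shape (9, 5)
state_after = np.append(window, new_features)       # np.append flattens -> shape (50,)

assert state_after.shape == (50,)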