Example #1
File: main.py Project: mmarklar/ddpg-aigym
def main():
    experiment = 'model-builder-v0'  #specify environments here
    env = gym.make(experiment)
    #steps= env.spec.timestep_limit #steps per episode
    steps = 20
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[0] + noise  #Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example #2
def main():
    experiment= 'InvertedPendulum-v1' #specify environments here
    env= gym.make(experiment)
    steps= env.spec.timestep_limit #steps per episode    
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    reward_per_episode = 0    
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]    
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])
      
    
    for i in xrange(episodes):
        print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            print "Action at step", t ," :",action,"\n"
            
            observation,reward,done,info=env.step(action)
            
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset() #reinitializing random noise for action exploration
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n\n'
                break
        total_reward+=reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)    
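
Note: every example on this page constructs an OUNoise object, but none of them includes its definition. The snippet below is a minimal sketch of an Ornstein-Uhlenbeck noise process consistent with the constructor signatures used in Examples #1 through #8 and #11 (OUNoise(action_dim) and OUNoise(action_dim, mu, theta, sigma)) and with their noise()/reset() calls; Examples #9 and #10 use variants exposing a sample() method instead. The classes actually shipped with these projects may differ in defaults and details.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise (sketch)."""

    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # re-initialize the process at the start of every episode
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), applied element-wise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
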
Example #3
File: main.py Project: wenjiebit/FCMADRL
def main():
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                 CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  #Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            # observation,reward,done,info=env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
Example #4
def main():
    env = Env(19997)
    steps= 10000
    num_states = 59
    num_actions = 3

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter=0
    reward_per_episode = 0    
    total_reward=0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')
      
    for i in range(episodes):
        # print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        done =False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            
            for i in range(num_actions):
                if action[i] > 1.0:
                    action[i] = 1.0
                if action[i] < -1.0:
                    action[i] = -1.0

            observation,reward,done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print('Episode',i,'Steps: ',t,'Episode Reward:',reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
        total_reward+=reward_per_episode
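
A side note on Example #4: the component-wise clipping loop reuses the loop variable i, which is also the episode index, so the episode number printed at the end of each episode actually shows the index of the last clipped action component. A generic alternative (not taken from the project above) is to clip the whole action vector in one call, which also avoids shadowing the episode index:

            # equivalent clipping without shadowing the episode index i
            action = np.clip(action, -1.0, 1.0)
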
Example #5
def main():
    env= Env(19997)
    steps = 300
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(2)
    counter = 0
    reward_per_episode = 0.
    num_states = 32*16
    num_actions = 2

    #saving reward:
    reward_st = np.array([0])
    
    for i in range(episodes):
        print ("==== Starting episode no:",str(i),"====","\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            
            observation,reward,done=env.step(action,t)
            agent.add_experience(x,observation,action,reward,done)

            if counter > 64:
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done):
                print ('EPISODE: ',str(i),' Steps: ',str(t),' Total Reward: ',str(reward_per_episode))
                exploration_noise.reset() #reinitializing random noise for action exploration
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                agent.actor_net.save_actor('/home/lee/Projects/Tracking/RL/weights/actor/model.ckpt')
                agent.critic_net.save_critic('/home/lee/Projects/Tracking/RL/weights/critic/model.ckpt')
                print ('\n\n')
                break
Example #6
class DDPG_REC:

    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr, critic_lr,
                 gamma, buffer_size, item_space, summary_dir):

        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)

    def gene_actions(self, weight_batch):
        """use output of actor network to calculate action list
        Args:
            weight_batch: actor network outputs

        Returns:
            recommendation list
        """
        item_ids = list(self.item_space.keys())
        item_weights = list(self.item_space.values())
        max_ids = list()
        for weight in weight_batch:
            score = np.dot(item_weights, np.transpose(weight))
            idx = np.argmax(score, 0)
            max_ids.append([item_ids[_] for _ in idx])
        return max_ids

    # def gene_action(self, weight):
    #     """use output of actor network to calculate action list
    #     Args:
    #         weight: actor network outputs
    #
    #     Returns:
    #         recommendation list
    #     """
    #     item_ids = list(self.item_space.keys())
    #     item_weights = list(self.item_space.values())
    #     score = np.dot(item_weights, np.transpose(weight))
    #     idx = np.argmax(score)
    #     return item_ids[idx]

    @staticmethod
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("reward", episode_reward)
        episode_max_q = tf.Variable(0.)
        tf.summary.scalar("max_q_value", episode_max_q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar("critic_loss", critic_loss)

        summary_vars = [episode_reward, episode_max_q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    def _train(self):
        samples = self.replay_buffer.sample_batch(self.batch_size)
        state_batch = np.asarray([_[0] for _ in samples])
        action_batch = np.asarray([_[1] for _ in samples])
        reward_batch = np.asarray([_[2] for _ in samples])
        n_state_batch = np.asarray([_[3] for _ in samples])
        done_batch = np.asarray([_[4] for _ in samples])

        seq_len_batch = np.asarray([self.state_item_num] * self.batch_size)

        # calculate predicted q value
        action_weights = self.actor.predict_target(state_batch, seq_len_batch)  # [batch_size,
        n_action_batch = self.gene_actions(action_weights.reshape((-1, self.action_item_num, self.emb_dim)))
        n_action_emb_batch = get_item_emb(n_action_batch, item_ids_emb_dict)
        target_q_batch = self.critic.predict_target(n_state_batch.reshape((-1, self.s_dim)),
                                                    n_action_emb_batch.reshape((-1, self.a_dim)), seq_len_batch)
        y_batch = []
        for i in range(self.batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.critic.gamma * target_q_batch[i])

        # train critic
        q_value, critic_loss, _ = self.critic.train(state_batch, action_batch,
                                                    np.reshape(y_batch, (self.batch_size, 1)), seq_len_batch)

        # train actor
        action_weight_batch_for_gradients = self.actor.predict(state_batch, seq_len_batch)
        action_batch_for_gradients = self.gene_actions(action_weight_batch_for_gradients)
        action_emb_batch_for_gradients = get_item_emb(action_batch_for_gradients, item_ids_emb_dict)
        a_gradient_batch = self.critic.action_gradients(state_batch,
                                                        action_emb_batch_for_gradients.reshape((-1, self.a_dim)),
                                                        seq_len_batch)
        self.actor.train(state_batch, a_gradient_batch[0], seq_len_batch)

        # update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return np.amax(q_value), critic_loss

    def action(self, state):
        weight = self.actor.predict(np.reshape(state, [1, self.s_dim]), np.array([self.state_item_num])) + \
                 self.exploration_noise.noise().reshape(
                     (1, self.action_item_num, int(self.a_dim / self.action_item_num)))
        action = self.gene_actions(weight)
        return np.array(action[0])

    def perceive_and_train(self, state, action, reward, n_state, done):
        action_emb = get_item_emb(action, item_ids_emb_dict)
        self.replay_buffer.add(list(state.reshape((self.s_dim,))),
                               list(action_emb.reshape((self.a_dim,))),
                               [reward],
                               list(n_state.reshape((self.s_dim,))),
                               [done])

        # Store transitions until the replay buffer exceeds the batch size, then start training
        ep_q_value_, critic_loss = 0, 0
        if self.replay_buffer.size() > self.batch_size:
            ep_q_value_, critic_loss = self._train()

        # if self.time_step % 10000 == 0:
        # self.actor_network.save_network(self.time_step)
        # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return ep_q_value_, critic_loss

    def write_summary(self, ep_reward, ep_q_value, loss, i):
        summary_str = self.sess.run(self.summary_ops, feed_dict={self.summary_vars[0]: ep_reward,
                                                                 self.summary_vars[1]: ep_q_value,
                                                                 self.summary_vars[2]: loss})
        self.writer.add_summary(summary_str, i)

    def save(self):
        self.writer.close()
        saver = tf.train.Saver()
        ckpt_path = os.path.join(os.path.dirname(__file__), "models")
        saver.save(self.sess, ckpt_path, write_meta_graph=False)
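
Example #6 depends on a ReplayBuffer class with add, size, and sample_batch methods that is not reproduced on this page (Examples #9, #10, and #11 construct buffers with different signatures, which this sketch does not cover). A minimal sketch compatible with the calls made above is shown below; the buffer actually used by the project may store and sample transitions differently.

import random
from collections import deque


class ReplayBuffer:
    """Fixed-size FIFO buffer of (state, action, reward, next_state, done) tuples (sketch)."""

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # oldest transitions are evicted automatically once the buffer is full
        self.buffer.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # uniform sampling without replacement
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))
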
Example #7
File: main.py Project: zxqzhang/ddpg-aigym
def main():
    experiment= 'InvertedPendulum-v1'
    env= gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    
    #saving reward:
    reward_st = np.array([0])
    
    
    
    for i in xrange(episodes):
        observation = env.reset()
    
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            #env.render()
            
            x = observation
            #select action using actor network model
            action = agent.evaluate_actor(np.reshape(x,[num_actions,num_states]))
            
            noise = exploration_noise.noise()
            
                       
            action = action[0] + noise
            
            
            print 'Agent.Action :',action
            print '\n'
            print '\n'
            
                      
            observation,reward,done,info=env.step(action)
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()            
            
            reward_per_episode+=reward
            
            counter+=1
            #check if episode ends:
            if done:
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n'
                print '\n'
                break
        total_reward+=reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)    
Example #8
class DDPG:
    def __init__(self, pretrain=False):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        # self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            # self.visualize_input = VISUALIZE_BUFFER
            # if self.visualize_input:
            #     self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 662
            self.width = 1
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth),
                                      dtype='float32')
            self.old_action = np.ones(2, dtype='float32')
            self.network_action = np.zeros(2, dtype='float32')
            self.noise_action = np.zeros(2, dtype='float32')
            self.action = np.zeros(2, dtype='float32')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim,
                                              self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim,
                                                self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH,
                                            self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # start_ = time.time()

            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 10.0)
            next_state_batch = np.divide(next_state_batch, 10.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            # if self.visualize_input:
            #     state_batch_np = np.asarray(state_batch)
            #     state_batch_np = np.multiply(state_batch_np, -100.0)
            #     state_batch_np = np.add(state_batch_np, 100.0)
            #     self.viewer.set_data(state_batch_np)
            #     self.viewer.run()
            #     self.visualize_input = False

            # Calculate y for the td_error of the critic

            # start = time.time()
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(
                next_state_batch, action_batch)
            q_value_batch = self.critic_network.target_evaluate(
                next_state_batch, next_action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "forward actor and critic time is: ", elapsed

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch lets train the critic
            # start = time.time()
            self.critic_network.train(y_batch, state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "train critic time is: ", elapsed

            # self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            # start = time.time()
            action_batch_for_gradients = self.actor_network.evaluate(
                state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "forward action after critic training time is: ", elapsed

            q_gradient_batch = self.critic_network.get_action_gradient(
                state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(
                q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            # start = time.time()
            self.actor_network.train(q_gradient_batch, state_batch,
                                     action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "train actor time is: ", elapsed

            # done = time.time()
            # elapsed = done - start_
            # print "====== total time is: ", elapsed

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session,
                                NET_SAVE_PATH,
                                global_step=self.training_step)

            # Update time step
            self.training_step += 1

            if self.training_step % 400 == 0:
                print "iter: ", self.training_step

        # start_ = time.time()
        self.data_manager.check_for_enqueue()
        # done = time.time()
        # elapsed = done - start_
        # print "############ check enqueue time is: ", elapsed

    def get_action(self, state, old_action):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 10.0)

        # Get the action
        self.action = self.actor_network.get_action(state, old_action)
        self.action = self.action.reshape((2, ))

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            self.action = 0.8 * self.exploration_noise.noise()
            # if self.training_step < MAX_NOISE_STEP:
            #     self.action += (MAX_NOISE_STEP - self.training_step) / \
            #         MAX_NOISE_STEP * self.exploration_noise.noise()
            # if the action value lies outside of the action bounds, rescale the action vector
            # if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
            #     self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
            # if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
            #     self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live Q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            state.astype('float32')
            self.old_action.astype('float32')
            self.old_action.astype('float32')
            self.data_manager.store_experience_to_file(self.old_state,
                                                       self.old_action, reward,
                                                       state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = 30 * q_value[0][0] + 30
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
Example #9
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high


        # Set the learning rate suggested by paper:  https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)

        # initialize targets model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta,
                   self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000

        self.batch_size = 64

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01 # for soft update of target parameters

        # Indicate whether we want to learn (or just use the policy to predict without learning)
        self.set_train(train)

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0
        self.step_count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        self.total_reward += reward
        self.step_count += 1
        # Save experience /reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.score = self.total_reward / float(self.step_count) if self.step_count else 0.0
        # Update the noise factor depending on the new score value
        if  self.score >= self.best_score:
            self.best_score = self.score
       
        # Learn, if enough samples are available in memory
        if self.train and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, done)

        # Roll over last state and action
        self.last_state= next_state

    def act(self, state):
        """Returns actions for given state(s)  as per current policy"""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample()) # add more noise for exploration

    def learn(self, experiences, done):
        """Update policy and value parameters using the given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)

        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_state = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_action = self.actor_target.model.predict_on_batch(next_state)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_state, next_action])

        # Compute Q targets for current states and train critic model(local)
        Q_targets = rewards + self.gamma * Q_targets_next * ( 1- dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),
                            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target method

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())


        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def set_train(self, train):
        self.train = train
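
The Keras-style agent in Example #9 is driven through reset_episode(), act(), and step(). A hypothetical driver loop could look like the following, assuming a task object whose step() returns (next_state, reward, done); neither the task nor the episode count below comes from the example itself.

# hypothetical driver loop for the DDPG agent in Example #9
agent = DDPG(task, train=True)

for episode in range(1000):
    state = agent.reset_episode()      # resets noise, task, and per-episode counters
    done = False
    while not done:                    # assumes the task eventually sets done=True
        action = agent.act(state)      # policy action plus OU exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("episode {:4d}  average reward per step {:7.3f}".format(episode, agent.score))
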
Example #10
class DDPG:
    def __init__(self,
                state_size,
                action_size,                
                tau,
                lr_actor,
                lr_critic,
                num_agents,
                agent_idx,
                seed,
                device,
                gamma,
                tensorboard_writer=None):
        
        self.state_size = state_size
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.num_agents = num_agents
        self.agent_idx = agent_idx
        self.seed = seed       
        self.device = device
        self.gamma = gamma
        random.seed(seed)
        self.tensorboard_writer = tensorboard_writer        
        
        self.actor_local = Actor(state_size, action_size, seed)
        self.actor_target = Actor(state_size, action_size, seed)
        
        critic_state_size = (state_size + action_size) * num_agents
        
        self.critic_local = Critic(critic_state_size, seed)
        self.critic_target = Critic(critic_state_size, seed)
        
        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target) 
        
        self.actor_optim = torch.optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        self.noise = OUNoise(action_size, seed)
        
        self.iteration = 0
        
    def to(self, device):
        self.actor_local.to(device)
        self.actor_target.to(device)
        self.critic_local.to(device)
        self.critic_target.to(device)
        return self
                             
    def act(self, state, noise_scale, use_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if use_noise:
            action += self.noise.sample() * noise_scale
        return np.clip(action, -1, 1)
    
    def learn(self, experiences, all_curr_pred_actions, all_next_pred_actions):
        
        agent_idx_device = torch.tensor(self.agent_idx).to(self.device)
        
        states, actions, rewards, next_states, dones = experiences

        rewards = rewards.index_select(1, agent_idx_device)
        dones = dones.index_select(1, agent_idx_device)
        
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
                
        batch_size = next_states.shape[0]
        
        actions_next = torch.cat(all_next_pred_actions, dim=1).to(self.device)
        next_states = next_states.reshape(batch_size, -1)      
        
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, actions_next)
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        states = states.reshape(batch_size, -1)
        actions = actions.reshape(batch_size, -1)
        
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()
        
        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_optim.zero_grad()
        predicted_actions = torch.cat([action if idx == self.agent_idx \
                   else action.detach()
                   for idx, action in enumerate(all_curr_pred_actions)],
                   dim=1).to(self.device)

        actor_loss = -self.critic_local(states, predicted_actions).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optim.step()
        
        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        
        if self.tensorboard_writer is not None:            
            self.tensorboard_writer.add_scalar("agent{}/actor_loss".format(self.agent_idx), al, self.iteration)
            self.tensorboard_writer.add_scalar("agent{}/critic_loss".format(self.agent_idx), cl, self.iteration)
            self.tensorboard_writer.file_writer.flush()
            
        self.iteration += 1

        # ----------------------- update target networks ----------------------- #
        soft_update(self.critic_target, self.critic_local, self.tau)
        soft_update(self.actor_target, self.actor_local, self.tau)           

    
    def reset(self):
        self.noise.reset()
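
The PyTorch agent above (Example #10) calls hard_update and soft_update helpers defined elsewhere in its repository. The sketch below assumes the argument orders implied by the calls, hard_update(source, target) in the constructor and soft_update(target, source, tau) in learn(); check the project's utility module before relying on it.

import torch


def hard_update(source, target):
    """Copy every parameter of source into target (assumed argument order)."""
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.copy_(src_param.data)


def soft_update(target, source, tau):
    """Polyak update: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    with torch.no_grad():
        for tgt_param, src_param in zip(target.parameters(), source.parameters()):
            tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)
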
Example #11
class PilotNode(object):
    def __init__(self, model, logfolder):
        print('initialize pilot node')

        np.random.seed(FLAGS.random_seed)
        tf.set_random_seed(FLAGS.random_seed)

        # Initialize replay memory
        self.logfolder = logfolder
        self.world_name = ''
        self.logfile = logfolder + '/tensorflow_log'
        self.run = 0
        self.run_eva = 0
        self.maxy = -10
        self.speed = FLAGS.speed
        self.accumlosses = {}
        self.current_distance = 0
        self.furthest_point = 0
        self.average_distance = 0
        self.average_distance_eva = 0
        self.last_pose = []
        self.model = model
        self.ready = False
        self.finished = True
        self.target_control = []
        self.target_depth = []
        self.target_odom = []
        self.aux_depth = []
        self.aux_odom = []
        self.odom_error = []
        self.prev_control = [0]
        self.nfc_images = []  #used by n_fc networks for building up concatenated frames
        self.nfc_poses = []  #used by n_fc networks for calculating odometry
        rospy.init_node('pilot', anonymous=True)
        self.exploration_noise = OUNoise(4, 0, FLAGS.ou_theta, 1)
        self.state = []

        # self.delay_evaluation = 5 #can't be set by ros because node is started before ros is started...
        if FLAGS.show_depth:
            self.depth_pub = rospy.Publisher('/depth_prediction',
                                             numpy_msg(Floats),
                                             queue_size=1)
        if FLAGS.show_odom:
            self.odom_pub = rospy.Publisher('/odom_prediction',
                                            numpy_msg(Floats),
                                            queue_size=1)
        # if FLAGS.off_policy:
        #   self.action_pub = rospy.Publisher('/supervised_vel', Twist, queue_size=1)
        #   if rospy.has_param('control'):
        #     rospy.Subscriber(rospy.get_param('control'), Twist, self.supervised_callback)
        if FLAGS.real or FLAGS.off_policy:
            self.action_pub = rospy.Publisher('/pilot_vel',
                                              Twist,
                                              queue_size=1)
        else:
            rospy.Subscriber('/supervised_vel', Twist,
                             self.supervised_callback)
            if rospy.has_param('control'):
                self.action_pub = rospy.Publisher(rospy.get_param('control'),
                                                  Twist,
                                                  queue_size=1)
        if rospy.has_param('ready'):
            rospy.Subscriber(rospy.get_param('ready'), Empty,
                             self.ready_callback)
        if rospy.has_param('finished'):
            rospy.Subscriber(rospy.get_param('finished'), Empty,
                             self.finished_callback)
        if rospy.has_param('rgb_image') and not FLAGS.depth_input:
            rospy.Subscriber(rospy.get_param('rgb_image'), Image,
                             self.image_callback)
        if rospy.has_param('depth_image'):
            if FLAGS.depth_input or FLAGS.auxiliary_depth or FLAGS.rl:
                rospy.Subscriber(rospy.get_param('depth_image'), Image,
                                 self.depth_callback)
        if FLAGS.recovery_cameras:
            # callbacks={'left':{'30':image_callback_left_30,'60':image_callback_left_60},'right':{'30':image_callback_right_30,'60':image_callback_right_60}}
            # callbacks_depth={'left':{'30':depth_callback_left_30,'60':depth_callback_left_60},'right':{'30':depth_callback_right_30,'60':depth_callback_right_60}}
            self.recovery_images = {}
            for d in ['left', 'right']:
                self.recovery_images[d] = {}
                for c in ['30', '60']:
                    self.recovery_images[d][c] = {}
                    self.recovery_images[d][c]['rgb'] = []
                    self.recovery_images[d][c]['depth'] = []
                    rospy.Subscriber(
                        re.sub(r"kinect", "kinect_" + d + "_" + c,
                               rospy.get_param('rgb_image')), Image,
                        self.image_callback_recovery, (d, c))
                    rospy.Subscriber(
                        re.sub(r"kinect", "kinect_" + d + "_" + c,
                               rospy.get_param('depth_image')), Image,
                        self.depth_callback_recovery, (d, c))

        if not FLAGS.real:  # in simulation
            self.replay_buffer = ReplayBuffer(FLAGS.buffer_size,
                                              FLAGS.random_seed)
            self.accumloss = 0
            rospy.Subscriber('/ground_truth/state', Odometry, self.gt_callback)

    def ready_callback(self, msg):
        if not self.ready and self.finished:
            print('Neural control activated.')
            self.ready = True
            self.start_time = rospy.get_time()
            self.finished = False
            self.exploration_noise.reset()
            self.speed = FLAGS.speed + (
                not FLAGS.evaluate) * np.random.uniform(
                    -FLAGS.sigma_x, FLAGS.sigma_x)
            if rospy.has_param('evaluate') and not FLAGS.real:
                # FLAGS.evaluate = False
                FLAGS.evaluate = rospy.get_param('evaluate')
                print '--> set evaluate to: ', FLAGS.evaluate
            # if FLAGS.lstm:
            #   self.state=self.model.get_init_state(True)
            #   print 'set state to: ', self.state
            if rospy.has_param('world_name'):
                self.world_name = os.path.basename(
                    rospy.get_param('world_name').split('.')[0])
                if 'sandbox' in self.world_name: self.world_name = 'sandbox'

    def gt_callback(self, data):
        if not self.ready: return
        # Keep track of positions for logging

        current_pos = [
            data.pose.pose.position.x, data.pose.pose.position.y,
            data.pose.pose.position.z
        ]
        if len(self.last_pose) != 0:
            self.current_distance += np.sqrt(
                (self.last_pose[0, 3] - current_pos[0])**2 +
                (self.last_pose[1, 3] - current_pos[1])**2)
        self.furthest_point = max([
            self.furthest_point,
            np.sqrt(current_pos[0]**2 + current_pos[1]**2)
        ])

        # Get pose (rotation and translation) for odometry
        quaternion = (data.pose.pose.orientation.x,
                      data.pose.pose.orientation.y,
                      data.pose.pose.orientation.z,
                      data.pose.pose.orientation.w)
        self.last_pose = transformations.quaternion_matrix(
            quaternion
        )  # orientation of current frame relative to global frame
        self.last_pose[0:3, 3] = current_pos

    def process_rgb(self, msg):
        # self.time_1 = time.time()
        # if not self.ready or self.finished or (rospy.get_time()-self.start_time) < self.delay_evaluation: return
        if not self.ready or self.finished: return []
        try:
            # Convert your ROS Image message to OpenCV2
            im = bridge.imgmsg_to_cv2(
                msg, 'rgb8'
            )  # changed to normal RGB order as I'll use matplotlib and PIL instead of opencv
            # an idea could be to swap these channels during online training, as this shouldn't matter, though this could
            # explain the performance drop coming from a pretrained network.
            # This does mean that online-trained nets might be worth nothing...
            # im = bridge.imgmsg_to_cv2(msg, 'bgr8')
        except CvBridgeError as e:
            print(e)
        else:
            # self.time_2 = time.time()
            size = self.model.input_size[1:]
            im = sm.imresize(im, tuple(size), 'nearest')
            # im = im*1/255.
            # Basic preprocessing: center + make 1 standard deviation
            # im -= FLAGS.mean
            # im = im*1/FLAGS.std
            return im

    def process_depth(self, msg):
        # if not self.ready or self.finished or (rospy.get_time()-self.start_time) < self.delay_evaluation: return
        if not self.ready or self.finished: return []
        try:
            # Convert your ROS Image message to OpenCV2
            im = bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough'
                                      )  #gets float of 32FC1 depth image
        except CvBridgeError as e:
            print(e)
        else:
            im = im[::8, ::8]
            shp = im.shape
            # assume that when a value is not a number it is due to a too large distance
            # values can be nan when objects are closer than 0.5m, but then the evaluate node should
            # kill the run anyway.
            im = np.asarray([
                e * 1.0 if not np.isnan(e) else 5 for e in im.flatten()
            ]).reshape(shp)  # clipping nans: dur: 0.010
            # print 'min: ',np.amin(im),' and max: ',np.amax(im)
            # im=np.asarray([ e*1.0 if not np.isnan(e) else 0 for e in im.flatten()]).reshape(shp) # clipping nans: dur: 0.010
            # Resize image
            if FLAGS.auxiliary_depth or FLAGS.rl:
                size = self.model.depth_input_size  #(55,74)
                im = sm.imresize(im, size, 'nearest')  # dur: 0.002
                # cv2.imshow('depth', im) # dur: 0.002
            if FLAGS.depth_input:
                size = (self.model.input_size[1], self.model.input_size[1])
                im = sm.imresize(im, size, 'nearest')  # dur: 0.009
                im = im[im.shape[0] / 2, :]
                # cv2.imshow('depth', im.reshape(1,im.shape[0])) # dur: 0.002
            # cv2.waitKey(2)
            im = im * 1 / 255. * 5.  # dur: 0.00004
            return im

    def image_callback(self, msg):
        im = self.process_rgb(msg)
        if len(im) != 0:
            if FLAGS.n_fc:
                self.nfc_images.append(im)
                self.nfc_poses.append(copy.deepcopy(self.last_pose))
                if len(self.nfc_images) < FLAGS.n_frames:
                    # print('filling concatenated frames: ',len(self.nfc_images))
                    return
                else:
                    # concatenate last n-frames
                    im = np.concatenate(np.asarray(
                        self.nfc_images[-FLAGS.n_frames:]),
                                        axis=2)
                    self.nfc_images = self.nfc_images[
                        -FLAGS.n_frames + 1:]  # concatenate last n-1-frames

                    self.nfc_poses.pop(0)  #get rid of the first one
                    assert len(self.nfc_poses) == FLAGS.n_frames - 1
                    # # calculate target odometry from previous global pose and current global pose
                    # euler = transformations.euler_from_matrix(self.nfc_poses[1], 'rxyz')
                    # # print 'current: ',str(euler[2]),str(self.nfc_poses[1][0,3]),str(self.nfc_poses[1][1,3])
                    # i_T_pg = transformations.inverse_matrix(self.nfc_poses[0])
                    # euler = transformations.euler_from_matrix(i_T_pg, 'rxyz')
                    # # print 'inverse prev: ',str(euler[2]), str(i_T_pg[0,3]),str(i_T_pg[1,3])
                    # T_cp = transformations.concatenate_matrices(i_T_pg, self.nfc_poses[1])
                    # r,p,yw = transformations.euler_from_matrix(T_cp, 'rxyz')
                    # x,y,z = T_cp[0:3,3]
                    # self.target_odom = [x,y,z,r,p,yw]
                    # print 'odom: ',str(self.target_odom[5]),str(self.target_odom[0]),str(self.target_odom[1])
                    # self.target_odom = [self.nfc_poses[1][i]-self.nfc_poses[0][i] for i in range(len(self.nfc_poses[0]))]
                    # print 'Target odometry: ', self.target_odom
            self.process_input(im)

    def image_callback_recovery(self, msg, args):
        im = self.process_rgb(msg)
        if len(im) == 0: return
        trgt = -100.
        if FLAGS.auxiliary_depth and len(
                self.recovery_images[args[0]][args[1]]['depth']) == 0:
            print("No target depth: {0} {1}".format(args[0], args[1]))
            return
        else:
            trgt_depth = copy.deepcopy(
                self.recovery_images[args[0]][args[1]]['depth'])
        if len(self.target_control) == 0:
            print("No target control: {0} {1}".format(args[0], args[1]))
            return
        else:
            # left ==> -1, right ==> +1, 30dg ==> 0.5, 60dg ==> 1.0
            compensation = -(args[0] == 'left') * int(
                args[1]) / 60. + (args[0] == 'right') * int(args[1]) / 60.
            trgt = compensation + self.target_control[5]
        if FLAGS.experience_replay and not FLAGS.evaluate and trgt != -100:
            if FLAGS.auxiliary_depth:
                print('added experience of camera: {0} {1} with control {2}'.
                      format(args[0], args[1], trgt))
                self.replay_buffer.add(im, [trgt], [trgt_depth])
            else:
                self.replay_buffer.add(im, [trgt])

    def depth_callback(self, msg):
        im = self.process_depth(msg)
        if len(im) != 0:
            if FLAGS.auxiliary_depth or FLAGS.rl:
                self.target_depth = im  #(64,)
            if FLAGS.depth_input:
                if FLAGS.network == 'nfc_control':
                    self.nfc_images.append(im)
                    if len(self.nfc_images) < 4:
                        # print('filling concatenated frames: ',len(self.nfc_images))
                        return
                    else:
                        # print np.asarray(self.nfc_images).shape
                        im = np.concatenate(np.asarray(self.nfc_images))
                        # print im.shape
                        self.nfc_images.pop(0)
                self.process_input(im)

    def depth_callback_recovery(self, msg, args):
        im = self.process_depth(msg)
        self.recovery_images[args[0]][args[1]]['depth'] = im

    def process_input(self, im):
        self.time_3 = time.time()
        trgt = -100.
        # if self.target_control == None or FLAGS.evaluate:
        if FLAGS.evaluate:  ### EVALUATE
            trgt_depth = []
            trgt_odom = []
            with_loss = False
            if len(
                    self.target_control
            ) != 0 and not FLAGS.auxiliary_depth and not FLAGS.auxiliary_odom:
                trgt = self.target_control[5]
                with_loss = True
            elif len(self.target_control
                     ) != 0 and FLAGS.auxiliary_depth and len(
                         self.target_depth) != 0 and not FLAGS.auxiliary_odom:
                trgt = self.target_control[5]
                trgt_depth = [copy.deepcopy(self.target_depth)]
                with_loss = True
            elif len(
                    self.target_control
            ) != 0 and not FLAGS.auxiliary_depth and FLAGS.auxiliary_odom and len(
                    self.target_odom) != 0:
                trgt = self.target_control[5]
                trgt_odom = [copy.deepcopy(self.target_odom)]
                with_loss = True
            elif len(
                    self.target_control
            ) != 0 and FLAGS.auxiliary_depth and len(
                    self.target_depth) != 0 and FLAGS.auxiliary_odom and len(
                        self.target_odom) != 0:
                trgt = self.target_control[5]
                trgt_odom = [copy.deepcopy(self.target_odom)]
                trgt_depth = [copy.deepcopy(self.target_depth)]
                with_loss = True
            if with_loss and False:  # for now, skip calculating accumulated losses.
                prev_ctr = [[self.prev_control[0]]]
                control, self.state, losses, aux_results = self.model.forward(
                    [[im]] if FLAGS.lstm else [im],
                    states=self.state,
                    auxdepth=FLAGS.show_depth,
                    auxodom=FLAGS.show_odom,
                    prev_action=prev_ctr,
                    targets=[[trgt]],
                    target_depth=trgt_depth,
                    target_odom=trgt_odom)
                if len(self.accumlosses.keys()) == 0:
                    self.accumlosses = losses
                else:
                    # self.accumlosses=[self.accumlosses[i]+losses[i] for i in range(len(losses))]
                    for v in losses.keys():
                        self.accumlosses[v] = self.accumlosses[v] + losses[v]
            else:
                prev_ctr = [[self.prev_control[0]]]
                control, self.state, losses, aux_results = self.model.forward(
                    [[im]] if FLAGS.lstm else [im],
                    states=self.state,
                    auxdepth=FLAGS.show_depth,
                    auxodom=FLAGS.show_odom,
                    prev_action=prev_ctr)
            if FLAGS.show_depth and FLAGS.auxiliary_depth and len(
                    aux_results) > 0:
                self.aux_depth = aux_results['depth']
            if FLAGS.show_odom and FLAGS.auxiliary_odom and len(
                    aux_results) > 0:
                self.aux_odom = aux_results['odom']
        else:  ###TRAINING
            # Get necessary labels, if label is missing wait...
            if len(self.target_control) == 0:
                print('No target control')
                return
            else:
                trgt = self.target_control[5]
                # print(trgt)
            if (FLAGS.auxiliary_depth or FLAGS.rl) and len(
                    self.target_depth) == 0:
                print('No target depth')
                return
            else:
                trgt_depth = copy.deepcopy(self.target_depth)
                # self.target_depth = []
            if FLAGS.auxiliary_odom and (len(self.target_odom) == 0
                                         or len(self.prev_control) == 0):
                print('no target odometry or previous control')
                return
            else:
                trgt_odom = copy.deepcopy(self.target_odom)
            # check if depth image corresponds to rgb image
            # cv2.imshow('rgb', im)
            # cv2.waitKey(2)
            # cv2.imshow('depth', trgt_depth*1/5.)
            # cv2.waitKey(2)
            # ---------------------------------------------------------- DEPRECATED
            # if not FLAGS.experience_replay: ### TRAINING WITHOUT EXPERIENCE REPLAY
            #   if FLAGS.auxiliary_depth:
            #     control, losses = self.model.backward([im],[[trgt]], [[[trgt_depth]]])
            #   else:
            #     control, losses = self.model.backward([im],[[trgt]])
            #   print 'Difference: '+str(control[0,0])+' and '+str(trgt)+'='+str(abs(control[0,0]-trgt))
            #   self.accumlosses += losses[0]
            # else: ### TRAINING WITH EXPERIENCE REPLAY
            # wait for the first target depth when auxiliary depth is used,
            # so the network has a depth target to predict
            self.time_4 = time.time()
            prev_ctr = [[self.prev_control[0]]]

            control, self.state, losses, aux_results = self.model.forward(
                [[im]] if FLAGS.lstm else [im],
                states=self.state,
                auxdepth=FLAGS.show_depth,
                auxodom=FLAGS.show_odom,
                prev_action=prev_ctr)
            if FLAGS.show_depth and FLAGS.auxiliary_depth:
                self.aux_depth = aux_results['depth']
            if FLAGS.show_odom and FLAGS.auxiliary_odom:
                self.aux_odom = aux_results['odom']
            self.time_5 = time.time()
            # print 'state: ', self.state
        ### SEND CONTROL
        noise = self.exploration_noise.noise()
        # yaw = control[0,0]
        # if np.random.binomial(1,FLAGS.epsilon) and not FLAGS.evaluate:
        # yaw = max(-1,min(1,np.random.normal()))
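        # during training, the expert target replaces the policy output with probability
        # alpha**run (annealed towards pure policy control); exploration noise is added below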
        if trgt != -100 and not FLAGS.evaluate:
            action = trgt if np.random.binomial(1, FLAGS.alpha**
                                                self.run) else control[0, 0]
        else:
            action = control[0, 0]
        msg = Twist()
        if FLAGS.type_of_noise == 'ou':
            msg.linear.x = self.speed  #0.8 # 1.8 #
            # msg.linear.x = FLAGS.speed+(not FLAGS.evaluate)*FLAGS.sigma_x*noise[0] #0.8 # 1.8 #
            msg.linear.y = (not FLAGS.evaluate) * noise[1] * FLAGS.sigma_y
            msg.linear.z = (not FLAGS.evaluate) * noise[2] * FLAGS.sigma_z
            msg.angular.z = max(
                -1,
                min(1, action +
                    (not FLAGS.evaluate) * FLAGS.sigma_yaw * noise[3]))
        elif FLAGS.type_of_noise == 'uni':
            msg.linear.x = self.speed
            # msg.linear.x = FLAGS.speed + (not FLAGS.evaluate)*np.random.uniform(-FLAGS.sigma_x, FLAGS.sigma_x)
            msg.linear.y = (not FLAGS.evaluate) * np.random.uniform(
                -FLAGS.sigma_y, FLAGS.sigma_y)
            msg.linear.z = (not FLAGS.evaluate) * np.random.uniform(
                -FLAGS.sigma_z, FLAGS.sigma_z)
            msg.angular.z = max(
                -1,
                min(
                    1, action + (not FLAGS.evaluate) *
                    np.random.uniform(-FLAGS.sigma_yaw, FLAGS.sigma_yaw)))
        else:
            raise IOError('Type of noise is unknown: {}'.format(
                FLAGS.type_of_noise))
        self.action_pub.publish(msg)
        self.prev_control = [msg.angular.z]
        self.time_6 = time.time()
        if FLAGS.show_depth and len(self.aux_depth) != 0 and not self.finished:
            # print('shape aux depth: {}'.format(self.aux_depth.shape))
            self.aux_depth = self.aux_depth.flatten()
            self.depth_pub.publish(self.aux_depth)
            self.aux_depth = []
        if FLAGS.show_odom and len(self.aux_odom) != 0 and not self.finished:
            # trgt_odom = [copy.deepcopy(self.target_odom)]
            # final_img = cv2.hconcat((im[:,:,0:3], im[:,:,3:6],im[:,:,6:]))
            # final_img = cv2.hconcat((im[:,:,[2,1,0]], im[:,:,[5,4,3]],im[:,:,[8,7,6]]))
            # print trgt_odom
            # cv2.imshow('Final', final_img)
            # cv2.waitKey(100)
            # cv2.destroyAllWindows()
            concat_odoms = np.concatenate(
                (self.aux_odom.astype(np.float32).flatten(),
                 np.array(trgt_odom).astype(np.float32).flatten()))
            # self.odom_pub.publish(self.aux_odom.flatten())
            # print concat_odoms[4:6],' and ',concat_odoms[0:2]
            self.odom_pub.publish(concat_odoms.astype(np.float32))
            # self.odom_error.append(np.abs(np.array(trgt_odom).flatten()-self.aux_odom.flatten()))
            self.aux_odom = []

        # ADD EXPERIENCE REPLAY
        if FLAGS.experience_replay and not FLAGS.evaluate and trgt != -100:
            aux_info = {}
            if FLAGS.auxiliary_depth or FLAGS.rl:
                aux_info['target_depth'] = trgt_depth
            if FLAGS.auxiliary_odom:
                # print trgt_odom
                # print 'target odom ',trgt_odom
                aux_info['target_odom'] = trgt_odom
                aux_info['prev_action'] = prev_ctr
            if FLAGS.lstm:
                # aux_info['state']=(np.zeros(()))
                # state type:  <type 'tuple'>  len:  2  len sub:  2  len subsub:  1  len subsubsub:  100
                aux_info['state'] = self.state
                # aux_info['state']=((np.zeros((1,100)),np.zeros((1,100))+10),(np.ones((1,100)),np.ones((1,100))+20))
                # print aux_info['state']
                # (state layer0,output layer0,state layer1,output layer1)
                # print 'state type: ',type(aux_info['state']),' len: ', len(aux_info['state']),' len sub: ', len(aux_info['state'][0]),' len subsub: ', len(aux_info['state'][0][0]),' len subsubsub: ', len(self.state[0][0][0])
            self.replay_buffer.add(im, [trgt], aux_info=aux_info)

        self.time_7 = time.time()
        if FLAGS.save_input:
            self.depthfile = open(self.logfolder + '/depth_input', 'a')
            np.set_printoptions(precision=5)
            message = "{0} : {1} : {2:.4f} \n".format(
                self.run,
                ' '.join('{0:.5f}'.format(k) for k in np.asarray(im)), trgt)
            self.depthfile.write(message)
            self.depthfile.close()
        self.time_8 = time.time()
        # print 'processed image @: {0:.2f}'.format(time.time())

        # print("Time debugging: \n cvbridge: {0} , \n resize: {1}, \n copy: {2} , \n net pred: {3}, \n pub: {4},\n exp buf: {5},\n pos file: {6} s".format((self.time_2-self.time_1),
        # (self.time_3-self.time_2),(self.time_4-self.time_3),(self.time_5-self.time_4),(self.time_6-self.time_5),(self.time_7-self.time_6),(self.time_8-self.time_7)))
        # Delay values with auxiliary depth (at the beginning of training)
        # cv bridge (RGB): 0.0003s
        # resize (RGB): 0.0015s
        # copy control+depth: 2.7e-5 s
        # net prediction: 0.011s
        # publication: 0.0002s
        # fill experience buffer: 1.8e-5 s
        # write position: 2.1e-6 s

    def supervised_callback(self, data):
        if not self.ready: return
        self.target_control = [
            data.linear.x, data.linear.y, data.linear.z, data.angular.x,
            data.angular.y, data.angular.z
        ]

    def finished_callback(self, msg):
        if self.ready and not self.finished:
            # self.depth_pub.publish(self.aux_depth)
            print('neural control deactivated.')
            self.ready = False
            self.finished = True
            # Train model from experience replay:
            # Train the model with batchnormalization out of the image callback loop
            activation_images = []
            depth_predictions = []
            endpoint_activations = []
            tloss = []  #total loss
            closs = []  #control loss
            dloss = []  #depth loss
            oloss = []  #odometry loss
            qloss = []  #RL cost-to-go loss
            tlossm, clossm, dlossm, olossm, qlossm, tlossm_eva, clossm_eva, dlossm_eva, olossm_eva, qlossm_eva = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            #tot_batch_loss = []
            if FLAGS.experience_replay and self.replay_buffer.size() > (
                    FLAGS.batch_size if not FLAGS.lstm else FLAGS.batch_size *
                    FLAGS.num_steps) and not FLAGS.evaluate:
                for b in range(
                        min(int(self.replay_buffer.size() / FLAGS.batch_size),
                            10)):
                    inputs, targets, aux_info = self.replay_buffer.sample_batch(
                        FLAGS.batch_size)
                    # import pdb; pdb.set_trace()
                    #print('time to smaple batch of images: ',time.time()-st)
                    if b == 0:
                        if FLAGS.plot_activations:
                            activation_images = self.model.plot_activations(
                                inputs, targets.reshape((-1, 1)))
                        if FLAGS.plot_depth and FLAGS.auxiliary_depth:
                            depth_predictions = self.model.plot_depth(
                                inputs,
                                aux_info['target_depth'].reshape(-1, 55, 74))
                        if FLAGS.plot_histograms:
                            endpoint_activations = self.model.get_endpoint_activations(
                                inputs)
                    init_state = []
                    depth_targets = []
                    odom_targets = []
                    prev_action = []
                    if FLAGS.lstm:
                        init_state = (aux_info['state'][:, 0, 0, 0, 0, :],
                                      aux_info['state'][:, 0, 0, 1, 0, :],
                                      aux_info['state'][:, 0, 1, 0, 0, :],
                                      aux_info['state'][:, 0, 1, 1, 0, :])
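                        # the four slices are the (state, output) pairs of the two LSTM layers,
                        # matching the (state layer0, output layer0, state layer1, output layer1) layout noted above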
                        # if FLAGS.use_init_state:
                        #   init_state=
                        assert init_state[0].shape[0] == FLAGS.batch_size
                        # print 'init_state sizes ',init_state[0].shape
                    if FLAGS.auxiliary_depth or FLAGS.rl:
                        depth_targets = aux_info['target_depth'].reshape(
                            -1, 55, 74)
                        # depth_targets=aux_info['target_depth'].reshape(-1,55,74) if not FLAGS.lstm else aux_info['target_depth'].reshape(-1,FLAGS.num_steps, 55,74)
                    if FLAGS.auxiliary_odom:
                        odom_targets = aux_info['target_odom'].reshape(
                            -1, 4) if not FLAGS.lstm else aux_info[
                                'target_odom'].reshape(-1, FLAGS.num_steps, 4)
                        # odom_targets=aux_info['target_odom'].reshape(-1,6) if not FLAGS.lstm else aux_info['target_odom'].reshape(-1,FLAGS.num_steps, 6)
                        prev_action = aux_info['prev_action'].reshape(
                            -1, 1
                        )  #if not FLAGS.lstm else aux_info['prev_action'].reshape(-1,FLAGS.num_steps, 1)
                    # todo add initial state for each rollout in the batch
                    controls, losses = self.model.backward(
                        inputs, init_state, targets[:].reshape(-1, 1),
                        depth_targets, odom_targets, prev_action)
                    tloss.append(losses['t'])
                    if not FLAGS.rl or FLAGS.auxiliary_ctr: closs.append(losses['c'])
                    if FLAGS.auxiliary_depth: dloss.append(losses['d'])
                    if FLAGS.auxiliary_odom: oloss.append(losses['o'])
                    if FLAGS.rl: qloss.append(losses['q'])
                tlossm = np.mean(tloss)
                clossm = np.mean(
                    closs) if not FLAGS.rl or FLAGS.auxiliary_ctr else 0
                dlossm = np.mean(dloss) if FLAGS.auxiliary_depth else 0
                olossm = np.mean(oloss) if FLAGS.auxiliary_odom else 0
                qlossm = np.mean(qloss) if FLAGS.rl else 0
            else:
                print('Evaluating or filling buffer or no experience_replay: ',
                      self.replay_buffer.size())
                if 't' in self.accumlosses.keys():
                    tlossm_eva = self.accumlosses['t']
                if 'c' in self.accumlosses.keys():
                    clossm_eva = self.accumlosses['c']
                if 'd' in self.accumlosses.keys():
                    dlossm_eva = self.accumlosses['d']
                if 'o' in self.accumlosses.keys():
                    olossm_eva = self.accumlosses['o']
                if 'q' in self.accumlosses.keys():
                    qlossm_eva = self.accumlosses['q']

            if not FLAGS.evaluate:
                self.average_distance = self.average_distance - self.average_distance / (
                    self.run + 1)
                self.average_distance = self.average_distance + self.current_distance / (
                    self.run + 1)
            else:
                self.average_distance_eva = self.average_distance_eva - self.average_distance_eva / (
                    self.run_eva + 1)
                self.average_distance_eva = self.average_distance_eva + self.current_distance / (
                    self.run_eva + 1)
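            # both branches keep an incremental running mean: average <- average * n/(n+1) + current_distance/(n+1)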

            odom_errx, odom_erry, odom_errz, odom_erryaw = 0, 0, 0, 0
            if len(self.odom_error) != 0:
                odom_errx = np.mean([e[0] for e in self.odom_error])
                odom_erry = np.mean([e[1] for e in self.odom_error])
                odom_errz = np.mean([e[2] for e in self.odom_error])
                odom_erryaw = np.mean([e[3] for e in self.odom_error])
            try:
                sumvar = {}
                # sumvar={k : 0 for k in self.model.summary_vars.keys()}
                sumvar["Distance_current_" +
                       self.world_name if len(self.world_name) != 0 else
                       "Distance_current"] = self.current_distance
                sumvar["Distance_furthest_" +
                       self.world_name if len(self.world_name) != 0 else
                       "Distance_furthest"] = self.furthest_point
                if FLAGS.evaluate:
                    sumvar["Distance_average_eva"] = self.average_distance_eva
                else:
                    sumvar["Distance_average"] = self.average_distance
                if tlossm != 0: sumvar["Loss_total"] = tlossm
                if clossm != 0: sumvar["Loss_control"] = clossm
                if dlossm != 0: sumvar["Loss_depth"] = dlossm
                if olossm != 0: sumvar["Loss_odom"] = olossm
                if qlossm != 0: sumvar["Loss_q"] = qlossm
                if tlossm_eva != 0: sumvar["Loss_total_eva"] = tlossm_eva
                if clossm_eva != 0: sumvar["Loss_control_eva"] = clossm_eva
                if dlossm_eva != 0: sumvar["Loss_depth_eva"] = dlossm_eva
                if olossm_eva != 0: sumvar["Loss_odom_eva"] = olossm_eva
                if qlossm_eva != 0: sumvar["Loss_q_eva"] = qlossm_eva
                if odom_errx != 0: sumvar["odom_errx"] = odom_errx
                if odom_erry != 0: sumvar["odom_erry"] = odom_erry
                if odom_errz != 0: sumvar["odom_errz"] = odom_errz
                if odom_erryaw != 0: sumvar["odom_erryaw"] = odom_erryaw
                if FLAGS.plot_activations and len(activation_images) != 0:
                    sumvar["conv_activations"] = activation_images
                    # sumvar.append(activation_images)
                if FLAGS.plot_depth and FLAGS.auxiliary_depth:
                    sumvar["depth_predictions"] = depth_predictions
                    # sumvar.append(depth_predictions)
                if FLAGS.plot_histograms:
                    for i, ep in enumerate(self.model.endpoints):
                        sumvar['activations_{}'.format(
                            ep)] = endpoint_activations[i]
                    # sumvar.extend(endpoint_activations)
                self.model.summarize(sumvar)
            except Exception as e:
                print('failed to write', e)
                pass
            else:
                print(
                    '{0}: control finished {1}:[ current_distance: {2:0.3f}, average_distance: {3:0.3f}, furthest point: {4:0.1f}, total loss: {5:0.3f}, control loss: {6:0.3e}, depth loss: {7:0.3e}, odom loss: {8:0.3e}, q loss: {9:0.3e}, world: {10}'
                    .format(
                        time.strftime('%H:%M'),
                        self.run if not FLAGS.evaluate else self.run_eva,
                        self.current_distance, self.average_distance
                        if not FLAGS.evaluate else self.average_distance_eva,
                        self.furthest_point,
                        tlossm if not FLAGS.evaluate else tlossm_eva,
                        clossm if not FLAGS.evaluate else clossm_eva,
                        dlossm if not FLAGS.evaluate else dlossm_eva,
                        olossm if not FLAGS.evaluate else olossm_eva,
                        qlossm if not FLAGS.evaluate else qlossm_eva,
                        self.world_name))
                l_file = open(self.logfile, 'a')
                tag = 'train'
                if FLAGS.evaluate:
                    tag = 'val'
                l_file.write(
                    '{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10}\n'.format(
                        self.run if not FLAGS.evaluate else self.run_eva,
                        self.current_distance, self.average_distance
                        if not FLAGS.evaluate else self.average_distance_eva,
                        self.furthest_point, tlossm, clossm, dlossm, olossm,
                        qlossm, tag, self.world_name))
                l_file.close()
            self.accumlosses = {}
            self.maxy = -10
            self.current_distance = 0
            self.last_pose = []
            self.nfc_images = []
            self.nfc_poses = []
            self.furthest_point = 0
            if FLAGS.lstm and not FLAGS.evaluate: self.replay_buffer.new_run()
            self.world_name = ''
            if self.run % 10 == 0 and not FLAGS.evaluate:
                # Save a checkpoint every 10 runs.
                self.model.save(self.logfolder)
            self.state = []
            if not FLAGS.evaluate:
                self.run += 1
            else:
                self.run_eva += 1
            # wait for gzserver to be killed
            gzservercount = 1
            while gzservercount > 0:
                #print('gzserver: ',gzservercount)
                gzservercount = os.popen("ps -Af").read().count('gzserver')
                time.sleep(0.1)
            sys.stdout.flush()
예제 #12
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, environment):
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self, observation):
        # receive initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        action_batch = np.resize(action_batch, [BATCH_SIZE, 1])

        # Calculate y
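        # y_i = r_i for terminal transitions, else r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1})) from the target networks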
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(
            next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(
            next_state_batch, next_action_batch)
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients) / BATCH_SIZE
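        # scale the sampled dQ/da gradients by 1/BATCH_SIZE (minibatch average rather than sum)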

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.exploration_noise.noise(),
                       self.environment.action_space.low,
                       self.environment.action_space.high)

    def set_feedback(self, observation, action, reward, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append(
            (self.state, action, reward, next_state, done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Store transitions to replay start size then start training
        if self.time_step > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-iniitialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
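
# None of the listed examples include the OUNoise class they import; below is a
# minimal sketch of the interface they assume (noise() and reset()). The parameter
# defaults are illustrative only and are not taken from any of the projects above.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated, mean-reverting exploration noise."""

    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(action_dimension) * mu

    def reset(self):
        # re-initialise the process, e.g. at the start of each episode
        self.state = np.ones_like(self.state) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): drift back towards mu plus Gaussian diffusion
        self.state = self.state + self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(*self.state.shape)
        return self.state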
예제 #13
0
class MaDDPG:
    def __init__(self, num_agents, state_dim, action_dim):
        # track training times
        self.time_step = 0
        # set up the TF session (log device placement to check GPU use)
        #self.sess = tf.InteractiveSession()
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=True))
        self.num_agents = num_agents
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents = self.create_multi_agents(self.sess, num_agents,
                                               self.state_dim, self.action_dim)
        # create the critic network after the agents; it summarises the mean Q value internally
        self.critic = CriticNetwork(self.sess, state_dim, action_dim)
        self.exploration_noise = OUNoise((self.num_agents, action_dim))
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # for store checkpoint
        self.saver = tf.train.Saver()

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.zeros((BATCH_SIZE, self.num_agents, self.state_dim))
        action_batch = np.zeros((BATCH_SIZE, self.num_agents, self.action_dim))
        reward_batch = np.zeros((BATCH_SIZE, self.num_agents))
        next_state_batch = np.zeros(
            (BATCH_SIZE, self.num_agents, self.state_dim))
        done_batch = np.zeros((BATCH_SIZE))
        for ii in range(BATCH_SIZE):
            state_batch[ii, :, :] = minibatch[ii][0]
            action_batch[ii, :, :] = minibatch[ii][1]
            reward_batch[ii, :] = minibatch[ii][2]
            next_state_batch[ii, :, :] = minibatch[ii][3]
            done_batch[ii] = minibatch[ii][4]

        # calculate Gt batch
        next_action_batch = self.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch,
                                             next_action_batch)
        gt = np.zeros((BATCH_SIZE, self.num_agents))
        for ii in range(BATCH_SIZE):
            if done_batch[ii]:
                gt[ii, :] = reward_batch[ii, :]
            else:
                gt[ii, :] = reward_batch[ii, :] + GAMMA * q_value_batch[ii, :]
        #update critic by minimizing the loss
        self.critic.train(gt, state_batch, action_batch)

        # update policy using the sampling gradients
        actions_for_grad = self.actions(state_batch)
        q_gradients_batch = self.critic.gradients(state_batch,
                                                  actions_for_grad)
        self.train_agents(q_gradients_batch, state_batch)

        # update critic target network
        self.critic.update_target()

        # update actor target
        self.update_agents_target()

    def summary(self, record_num):
        if self.replay_buffer.count() > SUMMARY_BATCH_SIZE:
            mini_batch = self.replay_buffer.popn(SUMMARY_BATCH_SIZE)
            state_batch = np.zeros(
                (SUMMARY_BATCH_SIZE, self.num_agents, self.state_dim))
            for ii in range(SUMMARY_BATCH_SIZE):
                state_batch[ii, :, :] = mini_batch[ii][0]

            actions_for_summary = self.actions(state_batch)
            self.critic.write_summaries(state_batch, actions_for_summary,
                                        record_num)

    def update_agents_target(self):
        for agent in self.agents:
            agent.update_target()

    def train_agents(self, gradients_batch, state_batch):
        # gradients_batch = [batchsize* agents* action_dim]
        # state_batch = [batchsize* agents * state_dim ]
        for ii in range(self.num_agents):
            grad = gradients_batch[:, ii, :]
            state = state_batch[:, ii, :]
            self.agents[ii].train(grad, state)

    def create_multi_agents(self, sess, num_agents, state_dim, action_dim):
        agents = []
        nets = None
        for ii in range(num_agents):
            agent_name = 'agent' + str(ii)
            agents.append(
                ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
            nets = agents[-1].nets
        return agents

    def add_agents(self, add_num):
        for ii in range(add_num):
            #self.num_agents+=1

            agent_name = 'agent' + str(self.num_agents)
            self.agents.append(
                ActorNetwork(self.sess, self.state_dim, self.action_dim,
                             agent_name, self.agents[-1].nets))
            # agent names run from 0 to num_agents-1
            self.num_agents += 1

        # when a new agent is added, reset the noise and the replay buffer
        self.exploration_noise = OUNoise((self.num_agents, self.action_dim))
        #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.replay_buffer.erase()
        # re-create the saver: the new one contains all currently savable variables,
        # otherwise it would only contain the initially created agents
        self.saver = tf.train.Saver()
        # reset the time step
        # self.time_step = 0

    def action(self, state):  # action for a single state per agent, not a batch of actions
        # state = [num_agents * state_dim]
        # actions = [num_agents *  action_dim]
        action = np.zeros((self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            action[ii, :] = self.agents[ii].action(state[ii, :])
        return action

    def actions(self, state_batch):
        #state = batch_size*numOfagents*state_dim
        #actions = batch_size*numOfagents*action_dim
        batch_size = state_batch.shape[0]
        actions = np.zeros((batch_size, self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            actions[:, ii, :] = self.agents[ii].actions(state_batch[:, ii, :])
        return actions

    def target_actions(self, state_batch):
        # the state size  is batch_size* num_agents * state_dimension
        actions = np.zeros(
            (state_batch.shape[0], self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            actions[:,
                    ii, :] = self.agents[ii].target_actions(state_batch[:,
                                                                        ii, :])
        return actions

    def noise_action(self, state):
        action = self.action(state)
        # clip the action, action \in [-1,+1]
        return np.clip(action + self.exploration_noise.noise(), -1, 1)

    def close_session(self):
        self.sess.close()

    def perceive(self, state, action, reward, next_state, done):
        # store {st,at,Rt+1,st+1}
        self.replay_buffer.add(state, action, reward, next_state, done)

        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()
            if self.time_step % SAVE_STEPS == 0:
                self.save_network()
            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

            # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state("saved_network")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print('Could not find old network weights')

    def save_network(self):
        # do not run this while the folder is syncing under Dropbox;
        # exit Dropbox, then run
        print('save network...', self.time_step)
        self.saver.save(self.sess,
                        'saved_network/' + 'network',
                        global_step=self.time_step)
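
# The update_target() calls in these examples live inside the actor/critic network
# classes, which are not listed here. In standard DDPG they perform a Polyak (soft)
# update of the target parameters; a minimal NumPy sketch, with TAU as an illustrative value:
import numpy as np

TAU = 0.001  # illustrative soft-update rate

def soft_update(target_weights, source_weights, tau=TAU):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target, element-wise per weight array
    return [tau * np.asarray(w) + (1.0 - tau) * np.asarray(w_t)
            for w, w_t in zip(source_weights, target_weights)]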
예제 #14
0
class DDPG:

    def __init__(self):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manger
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # get the next random batch from the data manger
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 100.0)
            next_state_batch = np.divide(next_state_batch, 100.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            if self.visualize_input:
                state_batch_np = np.asarray(state_batch)
                state_batch_np = np.multiply(state_batch_np, -100.0)
                state_batch_np = np.add(state_batch_np, 100.0)
                self.viewer.set_data(state_batch_np)
                self.viewer.run()
                self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch)

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch, let's train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            action_batch_for_gradients = self.actor_network.evaluate(state_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients)
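            # gradient inverting: invert() rescales each action-gradient component according to how close
            # the corresponding action already is to its bound, keeping the actor inside A0_BOUNDS/A1_BOUNDS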

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step)

            # Update time step
            self.training_step += 1

        self.data_manager.check_for_enqueue()

    def get_action(self, state):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 100.0)

        # Get the action
        self.action = self.actor_network.get_action(state)

        # Are we using noise?
        if self.noise_flag:
            # scale the noise down to 0 as training_step approaches MAX_NOISE_STEP
            if self.training_step < MAX_NOISE_STEP:
                self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise()
            # if action value lies outside of action bounds, rescale the action vector
            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
                self.action *= np.fabs(A0_BOUNDS[0]/self.action[0])
            if self.action[1] < A1_BOUNDS[0] or self.action[1] > A1_BOUNDS[1]:
                self.action *= np.fabs(A1_BOUNDS[0]/self.action[1])

        # Live Q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Skip storing the first experience of each episode (there is no valid old_state yet)
        if self.first_experience:
            self.first_experience = False
        else:
            self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save the old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        # cast to int so the bar can be drawn by string repetition
        stroke_pos = int(30 * q_value[0][0] + 30)
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        print('[' + stroke_pos * string + '|' + (60 - stroke_pos) * string + ']',
              "Q: ", q_value[0][0], "\tt: ", self.training_step)
예제 #15
0
def s2l():
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    num_states = feature_size   #num_states = env.observation_space.shape[0]
    num_actions = num_controls
    print ("Number of States:", num_states)
    print ("Number of Actions:", num_actions)
    action_space_high=[1.5] #[0.0,0.0,0.0]
    action_space_low=[0.03] #[0.5,0.5,0.5]
    print ("Action space highest values", action_space_high)
    print ("Action space lowest values:", action_space_low)
    robot=RoboControl()
    #while True:
    #    #robot.check()
    #    robot.publish_control([1])
    #    robot.reset() 

    agent = DDPG(is_batch_norm,num_states,num_actions,action_space_high,action_space_low)
    exploration_noise = OUNoise(num_actions)
    counter=0
    total_reward=0
    print ("Number of Rollouts per episode:", num_rollouts)
    print ("Number of Steps per roll out:", steps)
    reward_st = np.array([0])  #saving reward
    eval_metric_st= np.array([0])
    reward_st_all = np.array([0])  #saving reward after every step

    activity_obj=Vid_Feature()
    demo_vid_array=demo_array_extractor(demo_folder)
    demo_features=activity_obj.feature_extractor(demo_vid_array)

    frame_obj=Frame_Feature()
    #camera_obj= Camera()
    camera_obj= CameraSub()

    for episode in range(num_episodes):
        print ("==== Starting episode no:",episode,"====","\n")


        robot.reset()   # Reset the env at the beginning of each episode
        obs_img=camera_obj.camera_subscribe()   # Get the observation
        #obs_img=np.array(misc.imresize(obs_img,[112,112,3]))
        observation =np.array(frame_obj.frame_feature_extractor(obs_img))
        observation=observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):

            reward_per_rollout=0
            vid_robo_=[]

            for i in range(steps):

                x = observation

                action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
                noise = exploration_noise.noise()
                action = action[0] + noise #Select action according to current policy and exploration noise
                print ('Action at episode-',episode,'rollout-',t, 'step-', i ," :",action)


                robot.publish_control(action)
                
                obs_robo=camera_obj.camera_subscribe()   # Get the observation
            
                #obs_robo=misc.imresize(obs_robo,[112,112,3])
                vid_robo_.append(obs_robo)
                observation=np.array(frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation=observation.reshape(-1)
                #pasue()

                if(i==15):
                    vid_robo=np.array(vid_robo_)
                    robo_features=activity_obj.feature_extractor(vid_robo)
                    reward=-(distance(demo_features,robo_features))
                    reward=np.array(reward)
                    print('reward: ',reward)
                else:
                    reward=0
                    reward=np.array(reward)
                    print('reward: ',reward)
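                # sparse imitation reward: zero at every step except step 15 (end of the rollout),
                # where it is the negative activity-feature distance between the demo and robot videos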

                # Store the reward after every step
                reward_st_all = np.append(reward_st_all,reward)
                np.savetxt('reward_all.txt',reward_st_all, newline="\n")

                #add s_t,s_t+1,action,reward to experience memory
                agent.add_experience(x,observation,action,reward,False)
                reward_per_rollout+=reward
                counter+=1

            #train critic and actor network
            if counter > start_training:
                agent.train()
            print ('\n\n')

            #Saving policy
            if ((episode%100)==0 and t==num_rollouts-1):
                print('saving policy...........................!')
                agent.save_actor(episode)


            reward_per_episode+=reward_per_rollout

        #check if episode ends:

        print ('EPISODE: ',episode,' Total Reward: ',reward_per_episode)
        print ("Printing reward to file")
        exploration_noise.reset() #reinitializing random noise for action exploration
        reward_st = np.append(reward_st,reward_per_episode)
        np.savetxt('episode_reward.txt',reward_st, fmt='%f', newline="\n")
        print ('\n\n')

        total_reward+=reward_per_episode

    print ("Average reward per episode {}".format(total_reward / num_episodes))
예제 #16
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
        ###

    def state_initialiser(self, shape, mode='g'):
        if mode == 'z':  #Zero
            initial = np.zeros(shape=shape)
        elif mode == 'g':  #Gaussian
            # initial = stats.truncnorm.rvs(a=-0.02/0.01,b=0.02/0.01,loc=0.,scale=0.01,size=shape)
            initial = np.random.normal(loc=0.,
                                       scale=1. / float(shape[1]),
                                       size=shape)
        else:  # an adaptive initialiser could be added later
            raise NotImplementedError
        return initial

    def train(self, time_step):  #,time_step):
        ###1) Get-batch data for opt
        minibatch, trace_length = self.replay_buffer.get_batch(
            self.batch_size, self.trace_length,
            time_step)  #, self.trace_length)
        try:
            state_trace_batch = np.stack(minibatch[:, :, 2].ravel()).reshape(
                self.batch_size, trace_length, self.state_dim)
            action_trace_batch = np.stack(minibatch[:, :, 3].ravel()).reshape(
                self.batch_size, trace_length, self.action_dim)

            next_state_batch = np.stack(minibatch[:, -1, 6].ravel()).reshape(
                self.batch_size, 1, self.state_dim)
            next_state_trace_batch = np.concatenate(
                [state_trace_batch, next_state_batch], axis=1)

            reward_trace_batch = np.stack(minibatch[:, :, 4].ravel()).reshape(
                self.batch_size, trace_length, 1)
            done_trace_batch = np.stack(minibatch[:, :, 7].ravel()).reshape(
                self.batch_size, trace_length, 1)

        except Exception as e:
            print(str(e))
            raise

        ###2) Explicitly initialise the LSTMs' initial memories: not super-efficient, but it avoids tf's None-type zero-state problem
        init_actor_hidden1_cORm_batch = self.state_initialiser(
            shape=(self.batch_size, self.actor_network.rnn_size), mode='z')
        actor_init_h_batch = (
            init_actor_hidden1_cORm_batch, init_actor_hidden1_cORm_batch
        )  #((init_hidden1_cORm_batch,init_hidden1_cORm_batch),(init_actor_hidden2_cORm_batch,init_actor_hidden2_cORm_batch))

        init_critic_hidden1_cORm_batch = self.state_initialiser(
            shape=(self.batch_size, self.critic_network.rnn_size), mode='z')
        critic_init_h_batch = (
            init_critic_hidden1_cORm_batch, init_critic_hidden1_cORm_batch
        )  #,(init_critic_hidden3_cORm_batch,init_critic_hidden3_cORm_batch))
        ###

        self.dt_list = np.zeros(shape=(15, ))
        self.dt_list[-1] = time.time()
        if trace_length <= OPT_LENGTH:
            target_actor_init_h_batch = actor_init_h_batch
            target_critic_init_h_batch = critic_init_h_batch
            pass
        else:
            ### memory stuff
            actor_init_h_batch = self.actor_network.action(
                state_trace_batch[:, :-OPT_LENGTH, :],
                actor_init_h_batch,
                mode=1)
            target_actor_init_h_batch = actor_init_h_batch
            critic_init_h_batch = self.critic_network.evaluation(
                state_trace_batch[:, :-OPT_LENGTH, :],
                action_trace_batch[:, :-OPT_LENGTH, :],
                critic_init_h_batch,
                mode=1)
            target_critic_init_h_batch = critic_init_h_batch

            state_trace_batch = state_trace_batch[:, -OPT_LENGTH:, :]
            next_state_trace_batch = next_state_trace_batch[:, -(OPT_LENGTH +
                                                                 1):, :]
            action_trace_batch = action_trace_batch[:, -OPT_LENGTH:, :]
            reward_trace_batch = reward_trace_batch[:, -OPT_LENGTH:, :]
            done_trace_batch = done_trace_batch[:, -OPT_LENGTH:, :]
        self.dt_list[0] = time.time() - np.sum(self.dt_list)

        ###3) Obtain target output
        next_action_batch = self.actor_network.target_action(
            next_state_trace_batch,
            init_temporal_hidden_cm_batch=target_actor_init_h_batch)
        self.dt_list[1] = time.time() - np.sum(self.dt_list)
        next_action_trace_batch = np.concatenate(
            [action_trace_batch,
             np.expand_dims(next_action_batch, axis=1)],
            axis=1)
        self.dt_list[2] = time.time() - np.sum(self.dt_list)
        target_lastQ_batch = self.critic_network.target_q_trace(
            next_state_trace_batch,
            next_action_trace_batch,
            init_temporal_hidden_cm_batch=target_critic_init_h_batch)
        self.dt_list[3] = time.time() - np.sum(self.dt_list)

        # Control the length of time-step for gradient
        if trace_length <= OPT_LENGTH:
            update_length = np.minimum(
                trace_length,
                OPT_LENGTH // 1)  #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2)
        else:
            update_length = OPT_LENGTH // 1  #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2)

        target_lastQ_batch_masked = target_lastQ_batch * (
            1. - done_trace_batch[:, -1])
        rQ = np.concatenate([
            np.squeeze(reward_trace_batch[:, -update_length:], axis=-1),
            target_lastQ_batch_masked
        ],
                            axis=1)
        self.dt_list[4] = time.time() - np.sum(self.dt_list)

        try:
            discounting_mat = self.discounting_mat_dict[update_length]
        except KeyError:
            discounting_mat = np.zeros(shape=(update_length,
                                              update_length + 1),
                                       dtype=np.float)
            for i in range(update_length):
                discounting_mat[i, :i] = 0.
                discounting_mat[i,
                                i:] = GAMMA**np.arange(0.,
                                                       -i + update_length + 1)
            discounting_mat = np.transpose(discounting_mat)
            self.discounting_mat_dict[update_length] = discounting_mat
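        # y_trace_batch[:, k] = sum_{j=k}^{L-1} GAMMA**(j-k) * r_j + GAMMA**(L-k) * target_lastQ,
        # i.e. an n-step discounted return for every position of the truncated trace, computed as one matrix multiply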
        try:
            y_trace_batch = np.expand_dims(np.matmul(rQ, discounting_mat),
                                           axis=-1)
        except Exception as e:
            print('?')
            raise
        self.dt_list[5] = time.time() - np.sum(self.dt_list)

        ###4)Train Critic: get next_action, target_q, then optimise
        critic_grad = self.critic_network.train(
            y_trace_batch,
            update_length,
            state_trace_batch,
            action_trace_batch,
            init_temporal_hidden_cm_batch=critic_init_h_batch)
        self.dt_list[6] = time.time() - np.sum(self.dt_list)

        ###5) Train Actor: with the critic updated, dQ/da is available; run sess.run(dQ/da * da/dParam_actor), then optimise the actor
        for i in range(update_length):
            actor_init_h_batch_trace = (np.expand_dims(actor_init_h_batch[0],
                                                       axis=1),
                                        np.expand_dims(actor_init_h_batch[1],
                                                       axis=1))
            critic_init_h_batch_trace = (np.expand_dims(critic_init_h_batch[0],
                                                        axis=1),
                                         np.expand_dims(critic_init_h_batch[1],
                                                        axis=1))
            if i == 0:
                actor_init_h_batch_stack = actor_init_h_batch_trace
                critic_init_h_batch_stack = critic_init_h_batch_trace
            else:
                actor_init_h_batch_stack = (np.concatenate(
                    (actor_init_h_batch_stack[0], actor_init_h_batch_trace[0]),
                    axis=1),
                                            np.concatenate(
                                                (actor_init_h_batch_stack[1],
                                                 actor_init_h_batch_trace[1]),
                                                axis=1))
                critic_init_h_batch_stack = (
                    np.concatenate((critic_init_h_batch_stack[0],
                                    critic_init_h_batch_trace[0]),
                                   axis=1),
                    np.concatenate((critic_init_h_batch_stack[1],
                                    critic_init_h_batch_trace[1]),
                                   axis=1))
            action_trace_batch_for_gradients, actor_init_h_batch = self.actor_network.action_trace(
                np.expand_dims(state_trace_batch[:, i], 1),
                init_temporal_hidden_cm_batch=actor_init_h_batch)
            critic_init_h_batch = self.critic_network.evaluation_trace(
                np.expand_dims(state_trace_batch[:, i], 1),
                np.expand_dims(action_trace_batch[:, i], 1),
                init_temporal_hidden_cm_batch=critic_init_h_batch)
            if i == 0:
                action_trace_batch_for_gradients_stack = action_trace_batch_for_gradients
            else:
                action_trace_batch_for_gradients_stack = np.concatenate(
                    (action_trace_batch_for_gradients_stack,
                     action_trace_batch_for_gradients),
                    axis=1)

        self.dt_list[7] = time.time() - np.sum(self.dt_list)
        state_trace_batch_stack = np.reshape(
            state_trace_batch,
            (self.batch_size * update_length, 1, self.state_dim))
        action_trace_batch_stack = np.reshape(
            action_trace_batch,
            (self.batch_size * update_length, 1, self.action_dim))
        action_trace_batch_for_gradients_stack = np.reshape(
            action_trace_batch_for_gradients_stack,
            (self.batch_size * update_length, 1, self.action_dim))
        actor_init_h_batch_stack = (np.reshape(
            actor_init_h_batch_stack[0],
            (self.batch_size * update_length, self.actor_network.rnn_size)),
                                    np.reshape(
                                        actor_init_h_batch_stack[1],
                                        (self.batch_size * update_length,
                                         self.actor_network.rnn_size)))
        critic_init_h_batch_stack = (np.reshape(
            critic_init_h_batch_stack[0],
            (self.batch_size * update_length, self.critic_network.rnn_size)),
                                     np.reshape(
                                         critic_init_h_batch_stack[1],
                                         (self.batch_size * update_length,
                                          self.critic_network.rnn_size)))

        q_gradient_trace_batch = self.critic_network.gradients(
            1,
            state_trace_batch_stack,
            action_trace_batch_for_gradients_stack,
            init_temporal_hidden_cm_batch=critic_init_h_batch_stack)
        self.dt_list[8] = time.time() - np.sum(self.dt_list)

        # Update the actor policy using the sampled gradient:
        actor_grad = self.actor_network.train(
            q_gradient_trace_batch,
            1,
            state_trace_batch_stack,
            action_trace_batch_stack,
            init_temporal_hidden_cm_batch=actor_init_h_batch_stack)
        self.dt_list[9] = time.time() - np.sum(self.dt_list)

        # Update the target networks via EMA & Indicators
        # self.critic_network.update_target()
        self.dt_list[10] = time.time() - np.sum(self.dt_list)
        # self.actor_network.update_target()
        self.dt_list[11] = time.time() - np.sum(self.dt_list)

        # actor_diff = self.actor_network.get_diff()
        self.dt_list[12] = time.time() - np.sum(self.dt_list)
        # critic_diff = self.critic_network.get_diff()
        self.dt_list[13] = time.time() - np.sum(self.dt_list)

        self.dt_list = np.delete(self.dt_list, -1)
        return actor_grad, critic_grad,  # actor_diff, actor_grad, critic_diff, critic_grad

    def action(self, state_trace, init_hidden_cm, epi, noisy=True):
        # Select action a_t according to the current policy and exploration noise
        action, last_hidden_cm = self.actor_network.action([state_trace],
                                                           init_hidden_cm,
                                                           mode=2)
        if noisy:
            noise = self.exploration_noise.noise()  #epi)
            return action + noise, last_hidden_cm  #, dt#, np.linalg.norm(noise)
        else:
            return action, last_hidden_cm

    def evaluation(self, state_trace, action_trace, action_last,
                   init_hidden_cm):
        return self.critic_network.evaluation([state_trace], [action_trace],
                                              action_last,
                                              init_hidden_cm,
                                              mode=2)  #q_value, last_hidden_cm

    # def perceive(self,actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,time_step,epi):
    def perceive(self, state, action, reward, next_state, done, time_step,
                 epi):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        # self.replay_buffer.add(actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,epi)
        done = float(done)
        self.replay_buffer.add(state, action, reward, next_state, done, epi,
                               time_step)

        # Store transitions to replay start size then start training
        if (self.replay_buffer.num_experiences > REPLAY_START_SIZE):
            # Non-zero diff should be found
            self.actor_grad, self.critic_grad = self.train(time_step)
            # self.actor_diff, self.actor_grad, self.critic_diff, self.critic_grad = self.train(time_step)
        else:
            # Zero diff as is not trained
            # self.actor_diff = 0.
            self.actor_grad = 0.
            # self.critic_diff = 0.
            self.critic_grad = 0.

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
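# --- Note (not part of the original example) --------------------------------
# The discounting_mat trick in train() above computes, for every step i of a
# sampled trace of length L, the discounted rest-of-trace return
#     y_i = sum_{k=i}^{L-1} gamma**(k-i) * r_k  +  gamma**(L-i) * Q_target
# with a single matrix product. A standalone sketch with made-up numbers
# (gamma and L stand in for GAMMA and update_length above):
import numpy as np

gamma, L = 0.99, 3
disc = np.zeros((L, L + 1))
for i in range(L):
    disc[i, i:] = gamma ** np.arange(0., L + 1 - i)
disc = disc.T                                # shape (L+1, L)

rQ_demo = np.array([[1.0, 1.0, 1.0, 10.0]])  # [r_0, r_1, r_2, Q_target]
y_demo = np.matmul(rQ_demo, disc)            # per-step discounted returns
# y_demo[0, 0] == 1 + gamma*1 + gamma**2*1 + gamma**3*10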
Example #17
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN)
        print("######### TRAINING   #############")
        for k in range(Hp.N_TRAIN):
            minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size]
            state_batch_r = np.asarray([data[0] for data in minibatch])
            state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(state_batch_r[:, j], axis=0)
                state_batch.append(new_cat)
            #state_batch = [np.expand_dims(state_batch, axis=1)]
            action_batch = np.asarray([data[1] for data in minibatch])
            reward_batch = np.asarray([data[2] for data in minibatch])
            next_state_batch_r = np.asarray([data[3] for data in minibatch])
            next_state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(next_state_batch_r[:, j], axis=0)
                next_state_batch.append(new_cat)
            #next_state_batch = [np.expand_dims(next_state_batch, axis=1)]
            done_batch = np.asarray([data[4] for data in minibatch])

            # for action_dim = 1
            action_batch = np.resize(action_batch,
                                     [Hp.batch_size, self.action_dim])

            next_action_batch = self.actor_network.target_actions(
                self.target_state_input, next_state_batch)
            q_value_batch = self.critic_network.target_q(
                self.target_state_input, next_state_batch, next_action_batch)
            y_batch = []

            for i in range(len(minibatch)):
                if done_batch[i]:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] +
                                   Hp.GAMMA * q_value_batch[i])

            y_batch = np.resize(y_batch, [Hp.batch_size, 1])

            # Update critic by minimizing the loss L
            self.critic_network.train(y_batch, self.state_input, state_batch,
                                      action_batch)

            # Update the actor policy using the sampled gradient:
            action_batch_for_gradients = self.actor_network.actions(
                self.state_input, state_batch)
            q_gradient_batch = self.critic_network.gradients(
                self.state_input, state_batch, action_batch_for_gradients)

            self.summary_str2 = self.actor_network.train(
                q_gradient_batch, self.state_input, state_batch)

            # Update the target networks
            self.actor_network.update_target()
            self.critic_network.update_target()
            self.state_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        print("no noise ", action)
        return np.clip(
            action +
            self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]),
            [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0])

    def action(self, state):
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > Hp.REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
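# --- Note (not part of the original example) --------------------------------
# The y_batch loop in train() above implements the standard DDPG target
#     y_i = r_i                                       if the transition is terminal
#     y_i = r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1}))   otherwise,
# with Q' and mu' the target critic and target actor. A standalone, vectorized
# sketch of the same computation on made-up numbers:
import numpy as np

gamma = 0.99
reward_demo = np.array([1.0, 0.5, -1.0])
target_q_demo = np.array([2.0, 3.0, 4.0])    # target critic evaluated at (s', mu'(s'))
done_demo = np.array([0.0, 0.0, 1.0])        # 1.0 marks a terminal transition
y_demo = reward_demo + gamma * (1.0 - done_demo) * target_q_demo
y_demo = y_demo.reshape(-1, 1)               # [[2.98], [3.47], [-1.0]]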
Example #18
0
class DDPGAgent:
	def __init__(self, env):
		self.sess = tf.InteractiveSession()
		#self.params = loadparams() # ???
		self.env = env
		self.n_states = env.observation_space.shape[0]
		self.n_actions = env.action_space.shape[0]
		self.low = self.env.action_space.low
		self.high = self.env.action_space.high
		self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
		self.trainable_var_count = self.actor_network.get_trainable_var_count()
		self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \
			self.actor_network, self.trainable_var_count)
		self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']???
		self.exploration_noise = OUNoise(self.n_actions)
		# self.noise = Noise()
		self.gamma = GAMMA
		self.sess.run(tf.global_variables_initializer())

	def getNoisyAction(self, current_state):
		current_state = np.reshape(current_state, (1, self.n_states))
		# print ("current_state =", np.shape(current_state))
		action = self.actor_network.predict(current_state)
		return np.clip(action + self.exploration_noise.noise(), self.low, self.high)

	def getAction(self, current_state):
		return self.actor_network.predict( \
			np.reshape(current_state, (1, self.n_states)))

	def observe(self, state, action, reward, state_, done):
		self.replay_buffer.add(state, action[0], reward, state_, done)
		# batch = tf.concat([batch, (state,action,reward,state_)]) # axis???
		if (self.replay_buffer.count > 500):
			batch = self.replay_buffer.sampleBatch(BATCH_SIZE)
			self.updateActorAndCritic(batch)
		if done:
			self.exploration_noise.reset()

	def updateActorAndCritic(self, batch):
		# states, actions, rewards, states_, dones = zip(*batch)
		states = np.asarray([data[0] for data in batch])
		actions = np.asarray([data[1] for data in batch])
		rewards = np.asarray([data[2] for data in batch])
		states_ = np.asarray([data[3] for data in batch])
		dones = np.asarray([data[4] for data in batch])

		current_batch_size = BATCH_SIZE

		states = np.reshape(states, (current_batch_size, self.n_states))
		# print("actions shape----------", np.shape(actions))
		# actions = np.reshape(actions, (current_batch_size, self.n_actions))
		states_ = np.reshape(states_, (current_batch_size, self.n_states))

		actions_ = self.actor_network.predict_target(states_)

		y_batch = []
		q_batch = []
		yi =[]
		for i in range(current_batch_size):
			if dones[i]:
				yi = rewards[i]
			else:
				yi = rewards[i] + \
					self.gamma * self.critic_network.predict_target( \
						np.reshape(states_[i], (1, self.n_states)), \
						np.reshape(actions_[i], (1, self.n_actions)))  # use the target actor's action
			y_batch.append(yi)

		y_batch = np.reshape(y_batch,(current_batch_size,1))

		# print("critic update begins")
		self.critic_network.update(y_batch, states, actions)
		# print("critic update ends")

		# print("action batch begins")
		action_batch_for_gradient = self.actor_network.predict(states)
		# print("action batch ends")
		# action_batch_for_gradient = np.reshape( \
		# 	action_batch_for_gradient,(current_batch_size, 1))
		# print("q batch gradient begins")
		q_gradient_batch = self.critic_network.get_action_gradient(states, action_batch_for_gradient)
		# print("q batch gradient done")
		# q_gradient_batch = np.reshape( \
		# 	q_gradient_batch,(current_batch_size,1))
		# print("actor update begins")
		self.actor_network.update(states, q_gradient_batch)
		# print("actor update ends")

	def save(self):
		self.critic_network.save()
Example #19
0
def main():
    experiment = 'quadruped-robot-v0'  #specify environments here
    backupNameFile = "quadruped_robot_0"

    backupPathFile = "storage/" + backupNameFile
    bFullPath = os.path.join(
        os.path.split(os.path.abspath(__file__))[0], backupPathFile)

    env = gym.make(experiment)
    steps = env.spec.timestep_limit  #steps per episode
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    global agent
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            #rendering environmet (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            # print ("Action at step", t ," :",action,"\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                # print ('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode)
                # print ("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        # Save some episodes
        # print(episodes)
        # if (episodes == 10):
        # with open(bFullPath+"_EP_"+episodes+".pkl", 'wb') as file:
        #     pickle.dump(agent, file)
        # pickle.dump_session(bFullPath+"_EP_"+episodes+".pkl")
        # print ('SAVE EPISODE ',episodes)
        # break;
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example #20
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        mx.random.seed(seed)
        np.random.seed(seed)
        self.env = env
        if flg_gpu:
            self.ctx = mx.gpu(0)
        else:
            self.ctx = mx.cpu()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.ddpgnet = DDPGNet(self.state_dim, self.action_dim)
        self.exploration_noise = OUNoise(self.action_dim)
        self.replay_buffer = ReplayBuffer(memory_size)

        self.batch_size = batch_size

        self.ddpgnet.init()
        self.train_step = 0

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,
                                 [self.batch_size, self.action_dim])

        # Calculate y_batch
        next_qvals = self.ddpgnet.get_target_q(next_state_batch).asnumpy()

        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * next_qvals[i][0])
        y_batch = np.resize(y_batch, [self.batch_size, 1])

        # Update critic by minimizing the loss L
        self.ddpgnet.update_critic(state_batch, action_batch, y_batch)

        # Update actor by maxmizing Q
        self.ddpgnet.update_actor(state_batch)

        self.train_step += 1
        # update target networks
        self.ddpgnet.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > memory_start_size:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #21
0
class RDPG:
    """docstring for RDPG"""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observation and action
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store history to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        # (each stored history spans a complete episode)
        self.exploration_noise.reset()

    def pad(self, input):
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
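# --- Note (not part of the original example) --------------------------------
# A small illustration of the history slicing used in train() above: for an
# episode of T observations, sample i pairs the first i observations with the
# same sequence shifted by one step (values below are placeholders):
obs_demo = ['o0', 'o1', 'o2', 'o3']
for i in range(1, len(obs_demo)):
    observations_i = obs_demo[0:i]             # e.g. i=2 -> ['o0', 'o1']
    next_observations_i = obs_demo[1:i + 1]    # e.g. i=2 -> ['o1', 'o2']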
Example #22
0
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim = 1, it's a number not a array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
Example #23
0
def main():
    experiment = 'MountainCarContinuous-v0'
    env = gym.make(experiment)
    steps = env.spec.timestep_limit
    assert isinstance(env.observation_space, Box)
    assert isinstance(env.action_space, Box)

    agent = DDPG(env, is_batch_norm)  # created before the loop, so the weights carry over between episodes;
    # in other words, only one model is trained over the whole run.
    exploration_noise = OUNoise(env.action_space.shape[0])
    reward_per_episode = 0
    total_reward = 0
    counter = 0
    num_states = env.observation_space.shape[0] - 1
    num_actions = env.action_space.shape[0]
    # dimensions of the state and of the action

    print 'Number of States:', num_states
    print 'Number of Actions:', num_actions
    print 'Number of steps per episode:', steps

    if is_exploration == True:
        print("\nExploration phase for {} steps. ".format(exploration_steps))
        e_steps = 0
        while e_steps < exploration_steps:
            s = env.reset()
            one_step = 0
            done = False
            exploration_noise.reset()
            exp = []
            while not done:
                a = exploration_noise.noise()
                ss, r, done, _ = env.step(a)
                exp.append((s[:-1], a, ss[:-1], r, done))
                s = ss
                one_step += 1
                if one_step > 998:
                    break
            agent.add_experience(exp)
            e_steps += 1

    reward_st = np.array([0])  # used to store the reward of each episode

    for i in xrange(episodes):  # loop 1000 times in total
        print '====starting episode no:', i, '====', '\n'
        observation = env.reset()  # reset each episode, but the model parameters are not re-initialized
        reward_per_episode = 0
        LSTM_SIZE = 40
        statec_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE))
        stateh_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE))
        exp = []
        for t in xrange(steps):
            #env.render()
            x = [observation[0:num_states]]
            x = np.reshape(x * BATCH_SIZE, [BATCH_SIZE, num_states])
            actor, statec_t1, stateh_t1 = agent.evaluate_actor(
                x, statec_t1, stateh_t1)
            noise = exploration_noise.noise()
            #ra = random.random()
            if (i < 500):
                action = actor[0] + noise
            else:
                action = actor[0]
            observation, reward, done, info = env.step(action)
            #print 'Action at step',t,':',action,'reward:',reward,'\n'
            exp.append((x, action, observation[0:num_states], reward, done))

            if counter > 64:
                agent.train()
            counter += 1
            reward_per_episode += reward
            if (done or (t == steps - 1)):
                # the episode has ended
                agent.add_experience(exp)
                print 'EPISODE:', i, 'Steps', t, 'Total Reward:', reward_per_episode
                print 'Printing reward to file'
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline='\n')
                print '\n\n'
                break

        total_reward += reward_per_episode
    # compute the average over all episodes
    print "Average reward per episode {}".format(total_reward / episodes)
Example #24
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_space, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.sess = tf.Session()

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_space = state_space
        self.action_dim = action_dim  # 1

        self.ac_network = ActorCriticNetwork(self.sess, self.state_space,
                                             self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Get Q target label
        # maxQ(s',a')
        q_value_batch = self.ac_network.target_q(next_state_batch)

        # Calculate target maxQ(s,a): y = reward + GAMMA * maxQ(s',a')
        y_batch = []
        batch_size = len(minibatch)
        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [batch_size, 1])

        # Update eval critic network by minimizing the loss L
        cost = self.ac_network.train_critic(y_batch, state_batch, action_batch)
        print('step_%d critic cost:' % self.ac_network.time_step, cost)

        # Update eval actor policy using the sampled gradient:
        self.ac_network.train_actor(state_batch)

        # Update the target networks
        self.ac_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.ac_network.actions(state)
        return action[0] + self.exploration_noise.noise()

    def action(self, state):
        action = self.ac_network.actions([state])
        return action[0]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def sparse_tensor(self, state_batch, state_space):
        row = len(state_batch)
        indices = []
        for r in range(row):
            indices += [(r, c) for c in state_batch[r]]
        values = [1.0 for i in range(len(indices))]
        return tf.SparseTensorValue(indices=indices,
                                    values=values,
                                    dense_shape=[row, state_space])
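# --- Note (not part of the original example) --------------------------------
# sparse_tensor() above packs each state (a list of active feature indices)
# into a tf.SparseTensorValue. A standalone TF1-style sketch of feeding such a
# value into a graph (the placeholder and the ones-matmul are illustrative
# assumptions, not the project's actual network):
import tensorflow as tf

state_space_demo = 6
sp_input = tf.sparse_placeholder(tf.float32)
dense_out = tf.sparse_tensor_dense_matmul(sp_input, tf.ones([state_space_demo, 1]))

state_batch_demo = [[0, 2], [1, 4, 5]]       # active feature indices per row
indices = [(r, c) for r in range(len(state_batch_demo)) for c in state_batch_demo[r]]
value = tf.SparseTensorValue(indices=indices,
                             values=[1.0] * len(indices),
                             dense_shape=[len(state_batch_demo), state_space_demo])
with tf.Session() as sess:
    print(sess.run(dense_out, feed_dict={sp_input: value}))   # [[2.], [3.]]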
Example #25
0
File: main.py Project: msh0576/RL_WCPS
        a_t = np.reshape(a_t,[1,N_ACTIONS])
        r_t = np.reshape(r_t,[1,1])
        if t == 0:
            #initializing history at time, t = 0
            
            h_t = np.hstack([o_t,a_t,r_t])
        else:
            h_t = np.append(h_t,np.hstack([o_t,a_t,r_t]),axis = 0)
        reward_per_episode += r_t
        #appending history:
                    
        o_t = o_t1
        if (done or (t == STEPS-1)):
            print('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode)
            print("Printing reward to file")
            exploration_noise.reset() #reinitializing random noise for action exploration
            reward_st = np.append(reward_st,reward_per_episode)
            np.savetxt('episode_reward.txt',reward_st, newline="\n")
            print('\n\n')
            agent.add_to_replay(h_t,i)
            break
                
    if i == 0:
        #store episodes:
        R.append(h_t)
        #R = np.zeros([1,STEPS,NUM_ACTIONS+NUM_OUTPUTS+1])
        #R = np.append(R,np.reshape(h_t,[1,STEPS,NUM_ACTIONS+NUM_OUTPUTS+1]),axis = 0)
        #R = np.delete(R, (0), axis=0) #Initialing a zero array with size and deleting it back

    else:
        R.append(h_t)
Example #26
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, state_size_full,
                 action_size_full, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.state_size_full = state_size_full
        self.action_size_full = action_size_full
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(hyperparameters.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(hyperparameters.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size_full, action_size_full,
                                   random_seed).to(hyperparameters.device)
        self.critic_target = Critic(state_size_full, action_size_full,
                                    random_seed).to(hyperparameters.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters.LR_CRITIC,
            weight_decay=hyperparameters.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, eps, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(hyperparameters.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += eps * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
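# --- Note (not part of the original example) --------------------------------
# soft_update() above is the Polyak averaging step
#     theta_target <- tau * theta_local + (1 - tau) * theta_target.
# A standalone sketch (hypothetical single-layer networks, tau value assumed)
# of how such an update is typically applied after each learning step:
import torch.nn as nn

local_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
target_net.load_state_dict(local_net.state_dict())    # hard copy at start
tau = 1e-3

# ... after an optimizer step on local_net:
for target_param, local_param in zip(target_net.parameters(),
                                     local_net.parameters()):
    target_param.data.copy_(tau * local_param.data +
                            (1.0 - tau) * target_param.data)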
Example #27
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #28
0
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA = 0.99, epsilon=1.0, min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True, render=False):
    with tf.Session() as sess:


        # configuring environment
        env = gym.make(ENV_NAME)
        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)
        # info of the environment to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10) # I chose this number since MountainCarContinuous does not have an action boundary
        # Creating agent
        ruido = OUNoise(action_dim, mu = 0.4) # this is the Ornstein-Uhlenbeck Noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars(), DEVICE)


        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.
        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception:
            pass
#            print('********************************')
#            print('Failed to restore models')
#            print('********************************')


        for i in range(epochs):

            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= (epsilon/EXPLORE)
            epsilon = np.maximum(min_epsilon,epsilon)


            while (not done):

                if render:
                    env.render()

                #print('step', step)
                # 1. get action with actor, and add noise
                action_original = actor.predict(np.reshape(state,(1,state_dim))) # + (10. / (10. + i))* np.random.randn(1)
                action = action_original + max(epsilon,0)*ruido.noise()


                # remove comment if you want to see a step by step update
                # print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode)

                # 2. take action, see next state and reward :
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward,
                                      done, np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # Calculate targets

                        # 5. Train critic Network (states,actions, R + gamma* V(s', a')):
                        # 5.1 Get critic prediction = V(s', a')
                        # the a' is obtained using the actor prediction! or in other words : a' = actor(s')
                        target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                        # 5.2 get y_t where:
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])


                        # 5.3 Train Critic!
                        predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6 Compute Critic gradient (depends on states and actions)
                        # 6.1 therefore I first need to calculate the actions the current actor would take.
                        a_outs = actor.predict(s_batch)
                        # 6.2 I calculate the gradients
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()


                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]

                ep_reward = ep_reward + reward
                step +=1

            if done:
                ruido.reset()
                if state[0] > 0.45:
                    #print('****************************************')
                    #print('got it!')
                    #print('****************************************')
                    goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode
            print('Epoch', i+1, 'n steps', step, 'R:', round(ep_reward,3), 'Eps:', round(epsilon,3), 'Efficiency', round(100.*(goal/(i+1.)),3))


            # print('Efficiency', 100.*((goal)/(i+1.)))


        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
Example #29
0
File: ddpg.py Project: ChampionZP/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch)
        y_batch = []  
        for i in range(len(minibatch)): 
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else :
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self,state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action+self.exploration_noise.noise()

    def action(self,state):
        action = self.actor_network.action(state)
        return action

    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() >  REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #30
0
class PilotNode(object):
  """Node to listen to ROS topics like depth, rgb input and supervised control.
  The node also publishes to pilot control and predicted depth for visualization.
  """
  
  def __init__(self, FLAGS, model, logfolder):
    print('initialize pilot node')  
    self.FLAGS=FLAGS
    # Initialize fields
    self.logfolder = logfolder
    f=open(os.path.join(self.logfolder,'tf_log'),'a')
    f.write(self.FLAGS.log_tag)
    f.write('\n')
    f.close()
    self.model = model 
    self.ready=False 
    self.finished=True
    self.training=False
    
    self.last_pose=[] # previous pose, used for accumulative distance
    self.world_name = ''
    self.runs={'train':0, 'test':0} # number of online training runs (used for averaging)
    # self.accumlosses = {} # gather losses and info over the run in a dictionary
    self.current_distance=0 # accumulative distance travelled from beginning of run used at evaluation
    self.furthest_point=0 # furthest point reached from spawning point at the beginning of run
    self.average_distances={'train':0, 'test':0} # running average over different runs
    self.target_control = [] # field to keep the latest supervised control
    self.target_depth = [] # field to keep the latest supervised depth
    self.nfc_images =[] #used by n_fc networks for building up concatenated frames
    self.exploration_noise = OUNoise(4, 0, self.FLAGS.ou_theta,1)
    if not self.FLAGS.dont_show_depth: self.depth_pub = rospy.Publisher('/depth_prediction', numpy_msg(Floats), queue_size=1)
    self.action_pub=rospy.Publisher('/nn_vel', Twist, queue_size=1)

    self.model.reset_metrics()

    rospy.Subscriber('/nn_start', Empty, self.ready_callback)
    rospy.Subscriber('/nn_stop', Empty, self.finished_callback)

    # extract imitation loss from supervised velocity
    rospy.Subscriber('/supervised_vel', Twist, self.supervised_callback)
    
    self.start_time = 0
    self.imitation_loss=[]
    self.depth_prediction=[]
    self.depth_loss=[]
    self.driving_duration=-1

    self.skip_frames = 0
    self.img_index = 0
    self.fsm_index = 0

    if rospy.has_param('rgb_image'): 
      image_topic=rospy.get_param('rgb_image')
      if 'compressed' in image_topic:
        rospy.Subscriber(image_topic, CompressedImage, self.compressed_image_callback)
      else:
        rospy.Subscriber(image_topic, Image, self.image_callback)
    if rospy.has_param('depth_image'):
      depth_topic = rospy.get_param('depth_image')
      if 'scan' in depth_topic:
        rospy.Subscriber(depth_topic, LaserScan, self.scan_depth_callback)
      else:
        rospy.Subscriber(depth_topic, Image, self.depth_callback)
    if not self.FLAGS.real: # initialize the replay buffer
      self.replay_buffer = ReplayBuffer(self.FLAGS, self.FLAGS.random_seed)
      self.accumloss = 0
      if rospy.has_param('gt_info'):
        rospy.Subscriber(rospy.get_param('gt_info'), Odometry, self.gt_callback)

    # Add some lines to debug delays:
    self.time_im_received=[]
    self.time_ctr_send=[]

    rospy.init_node('pilot', anonymous=True)  
    
       
  def ready_callback(self,msg):
    """ callback function that makes DNN policy starts the ready flag is set on 1 (for 3s)"""
    if not self.ready and self.finished:
      print('Neural control activated.')
      self.ready = True
      self.start_time = rospy.get_time()
      self.finished = False
      self.exploration_noise.reset()
      # choose one speed for this flight
      self.FLAGS.speed=self.FLAGS.speed + (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_x, self.FLAGS.sigma_x)
      if rospy.has_param('evaluate'):
        self.FLAGS.evaluate = rospy.get_param('evaluate')
        print('--> set evaluate to: {0} with speed {1}'.format(self.FLAGS.evaluate, self.FLAGS.speed))
      if rospy.has_param('skip_frames'):
        self.skip_frames = rospy.get_param('skip_frames')
        print('--> set skip_frames to: {0}'.format(self.skip_frames))
      if rospy.has_param('world_name') :
        self.world_name = rospy.get_param('world_name')
      time.sleep(1) # wait one second, otherwise create_dataset can't follow...
        
  def gt_callback(self, data):
    """Callback function that keeps track of positions for logging"""
    if not self.ready or self.training: return
    current_pos=[data.pose.pose.position.x,
                    data.pose.pose.position.y,
                    data.pose.pose.position.z]
    if len(self.last_pose)!= 0:
        self.current_distance += np.sqrt((self.last_pose[0,3]-current_pos[0])**2+(self.last_pose[1,3]-current_pos[1])**2)
    self.furthest_point=max([self.furthest_point, np.sqrt(current_pos[0]**2+current_pos[1]**2)])

    # Get pose (rotation and translation) [DEPRECATED: USED FOR ODOMETRY]
    quaternion = (data.pose.pose.orientation.x,
      data.pose.pose.orientation.y,
      data.pose.pose.orientation.z,
      data.pose.pose.orientation.w)
    self.last_pose = transformations.quaternion_matrix(quaternion) # orientation of current frame relative to global frame
    self.last_pose[0:3,3]=current_pos

  def process_rgb(self, msg):
    """ Convert RGB serial data to opencv image of correct size"""
    try:
      # Convert your ROS Image message to OpenCV2
      # changed to normal RGB order as I'll use matplotlib and PIL instead of OpenCV
      img =bridge.imgmsg_to_cv2(msg, 'rgb8') 
    except CvBridgeError as e:
      print(e)
    else:
      img = img[::2,::5,:]
      size = self.model.input_size[1:]
      img = sm.resize(img,size,mode='constant').astype(float)
      return img

  def process_rgb_compressed(self, msg):
    """ Convert RGB serial data to opencv image of correct size"""
    # if not self.ready or self.finished: return []
    try:
      img = bridge.compressed_imgmsg_to_cv2(msg, desired_encoding='passthrough')
    except CvBridgeError as e:
      print(e)
    else:
      # 308x410 to 128x128
      img = img[::2,::3,:]
      size = self.model.input_size[1:]
      img = sm.resize(img,size,mode='constant').astype(float)
      return img

  def process_depth(self, msg):
    """ Convert depth serial data to opencv image of correct size"""
    # if not self.ready or self.finished: return [] 
    try:
      # Convert your ROS Image message to OpenCV2
      de = bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough')#gets float of 32FC1 depth image
    except CvBridgeError as e:
      print(e)
    else:
      
      de = de[::6,::8]
      shp=de.shape
      # # assume that when value is not a number it is due to a too large distance (set to 5m)
      # # values can be NaN when obstacles are closer than 0.5m, but then the evaluate node should
      # # kill the run anyway.
      de=np.asarray([ e*1.0 if not np.isnan(e) else 5 for e in de.flatten()]).reshape(shp) # clipping nans: dur: 0.010
      size = (55,74)
      # print 'DEPTH: min: ',np.amin(de),' and max: ',np.amax(de)
      
      de = sm.resize(de,size,order=1,mode='constant', preserve_range=True)
      return de

  def process_scan(self, msg):
    """Preprocess serial scan: clip horizontal field of view, clip at 1's and ignore 0's, smooth over 4 bins."""
    # field of view should follow camera: 
    #    wide-angle camera: -60 to 60. 
    #    normal camera: -35 to 35.
    ranges=[1 if r > 1 or r==0 else r for r in msg.ranges]
    # keep the first field_of_view/2 bins (reversed) followed by the last field_of_view/2 bins (reversed):
    ranges=list(reversed(ranges[:self.FLAGS.field_of_view//2]))+list(reversed(ranges[-self.FLAGS.field_of_view//2:]))
    # add some smoothing by averaging over 4 neighboring bins
    ranges = [sum(ranges[i*self.FLAGS.smooth_scan:i*self.FLAGS.smooth_scan+self.FLAGS.smooth_scan])/self.FLAGS.smooth_scan for i in range(int(len(ranges)/self.FLAGS.smooth_scan))]
    # make it a numpy array
    de = np.asarray(ranges).reshape((1,-1))
    # if list(de.shape) != self.model.output_size: # reshape if necessary
    #   de = sm.resize(de,self.model.output_size,order=1,mode='constant', preserve_range=True)
    return de
    
  def compressed_image_callback(self, msg):
    """ Process serial image data with process_rgb and concatenate frames if necessary"""
    im = self.process_rgb_compressed(msg)
    if len(im)!=0: 
      self.process_input(im)
  
  def image_callback(self, msg):
    """ Process serial image data with process_rgb and concatenate frames if necessary"""
    self.time_im_received.append(time.time())
    im = self.process_rgb(msg)
    if len(im)!=0: 
      if 'nfc' in self.FLAGS.network: # when features are concatenated, multiple images should be kept.
        self.nfc_images.append(im)
        if len(self.nfc_images) < self.FLAGS.n_frames: return
        else:
          # concatenate last n-frames
          im = np.concatenate(np.asarray(self.nfc_images[-self.FLAGS.n_frames:]),axis=2)
          self.nfc_images = self.nfc_images[-self.FLAGS.n_frames+1:] # keep the last n-1 frames
      self.process_input(im)
    
  def depth_callback(self, msg):
    im = self.process_depth(msg)
    if len(im)!=0 and self.FLAGS.auxiliary_depth:
      self.target_depth = im
  
  def scan_depth_callback(self, msg):
    im = self.process_scan(msg)
    if len(im)!=0:
      self.depth = im
      # calculate depth loss on the fly
      if len(self.depth_prediction) != 0:
        # print("pred: {0} trg: {1}".format(self.depth_prediction, self.depth))
        self.depth_loss.append(np.mean((self.depth_prediction - self.depth.flatten())**2))

  def process_input(self, im):
    """Process the inputs: images, targets, auxiliary tasks
      Predict control based on the inputs.
      Plot auxiliary predictions.
      Fill replay buffer.
    """
    # skip a number of frames to lower the actual control rate
    # independently of the image frame rate
    if self.skip_frames != 0:
      self.img_index+=1
      if self.img_index % (self.skip_frames+1) != 0:
        return

    aux_depth=[] # variable to keep predicted depth 
    trgt = -100.
    inpt=im
    
    if self.FLAGS.evaluate: ### EVALUATE
      trgt=np.array([[self.target_control[5]]]) if len(self.target_control) != 0 else []
      trgt_depth = np.array([copy.deepcopy(self.target_depth)]) if len(self.target_depth) !=0 and self.FLAGS.auxiliary_depth else []
      control, aux_results = self.model.forward([inpt], auxdepth= not self.FLAGS.dont_show_depth,targets=trgt, depth_targets=trgt_depth)
      if not self.FLAGS.dont_show_depth and self.FLAGS.auxiliary_depth and len(aux_results)>0: aux_depth = aux_results['d']
    else: ###TRAINING
      # Get necessary labels, if label is missing wait...
      def check_field(target_name):
        if len(target_name) == 0:
          # print('Waiting for target {}'.format(target_name))
          return False
        else:
          return True
      if not check_field(self.target_control): return
      else: 
        trgt = self.target_control[5]
      if self.FLAGS.auxiliary_depth:
        if not check_field(self.target_depth): 
          return
        else: 
          trgt_depth = copy.deepcopy(self.target_depth)
      control, aux_results = self.model.forward([inpt], auxdepth=not self.FLAGS.dont_show_depth)
      if not self.FLAGS.dont_show_depth and self.FLAGS.auxiliary_depth: aux_depth = aux_results['d']
    
    ### SEND CONTROL
    control = control[0]
    # print control
    if trgt != -100 and not self.FLAGS.evaluate: # policy mixing with self.FLAGS.alpha
      action = trgt if np.random.binomial(1, self.FLAGS.alpha**(self.runs['train']+1)) else control
    else:
      action = control
    msg = Twist()
    msg.linear.x = self.FLAGS.speed 
    if self.FLAGS.noise == 'ou':
      noise = self.exploration_noise.noise()
      msg.linear.y = (not self.FLAGS.evaluate)*noise[1]*self.FLAGS.sigma_y
      msg.linear.z = (not self.FLAGS.evaluate)*noise[2]*self.FLAGS.sigma_z
      msg.angular.z = max(-1,min(1,action+(not self.FLAGS.evaluate)*self.FLAGS.sigma_yaw*noise[3]))
    elif self.FLAGS.noise == 'uni':
      # msg.linear.x = self.FLAGS.speed + (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_x, self.FLAGS.sigma_x)
      msg.linear.y = (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_y, self.FLAGS.sigma_y)
      msg.linear.z = (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_z, self.FLAGS.sigma_z)
      msg.angular.z = max(-1,min(1,action+(not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_yaw, self.FLAGS.sigma_yaw)))
    else:
      raise IOError( 'Type of noise is unknown: {}'.format(self.FLAGS.noise))
    # if np.abs(msg.angular.z) > 0.3: msg.linear.x =  0.
    if np.abs(msg.angular.z) > 0.3 and self.FLAGS.break_and_turn: msg.linear.x = 0. + np.random.binomial(1, 0.1)

    self.action_pub.publish(msg)
    self.time_ctr_send.append(time.time())

    ### keep track of imitation loss on the fly
    if len(self.target_control) != 0:
      self.imitation_loss.append((self.target_control[5]-action)**2)

    if not self.FLAGS.dont_show_depth and len(aux_depth) != 0 and not self.finished:
      aux_depth = aux_depth.flatten()
      self.depth_pub.publish(aux_depth)
      aux_depth = []
      
    # ADD EXPERIENCE REPLAY
    if not self.FLAGS.evaluate and trgt != -100:
      experience={'state':im,
                  'action':action,
                  'trgt':trgt}
      if self.FLAGS.auxiliary_depth: experience['target_depth']=trgt_depth
      self.replay_buffer.add(experience)
      # print("added experience: {0} vs {1}".format(action, trgt))

  def supervised_callback(self, data):
    """Get target control from the /supervised_vel node"""
    # print 'received control'

    if not self.ready: return
    self.target_control = [data.linear.x,
      data.linear.y,
      data.linear.z,
      data.angular.x,
      data.angular.y,
      data.angular.z]
      
  def finished_callback(self,msg):
    """When run is finished:
        sample 10 batches from the replay buffer,
        apply gradient descent on the model,
        write log file and checkpoints away
    """
    if self.ready and not self.finished:
      print('neural control deactivated. @ time: {}'.format(time.time()))

      self.ready=False
      self.finished=True
      if self.start_time!=0: 
        self.driving_duration = rospy.get_time() - self.start_time

      
      # Train model from experience replay:
      # Train the model with batchnormalization out of the image callback loop
      depth_predictions = []
      losses_train = {}
      if self.replay_buffer.size()>self.FLAGS.batch_size and not self.FLAGS.evaluate:
        for b in range(min(int(self.replay_buffer.size()/self.FLAGS.batch_size), 10)):
          inputs, targets, aux_info = self.replay_buffer.sample_batch(self.FLAGS.batch_size)
          if b==0:
            if self.FLAGS.plot_depth and self.FLAGS.auxiliary_depth:
              depth_predictions = tools.plot_depth(inputs, aux_info['target_depth'].reshape(-1,55,74))
          depth_targets=[]
          if self.FLAGS.auxiliary_depth: 
            depth_targets=aux_info['target_depth'].reshape(-1,55,74)
          losses = self.model.backward(inputs,targets[:].reshape(-1,1),depth_targets)
          for k in losses.keys(): 
            try:
              losses_train[k].append(losses[k])
            except:
              losses_train[k]=[losses[k]]
      
      # Gather all info to build a proper summary and string of results
      k='train' if not self.FLAGS.evaluate else 'test'
      self.average_distances[k]= self.average_distances[k]-self.average_distances[k]/(self.runs[k]+1)
      self.average_distances[k] = self.average_distances[k]+self.current_distance/(self.runs[k]+1)
      self.runs[k]+=1
      sumvar={}
      result_string='{0}: run {1}'.format(time.strftime('%H:%M'),self.runs[k])
      vals={'current':self.current_distance, 'furthest':self.furthest_point}
      for d in ['current', 'furthest']:
        name='Distance_{0}_{1}'.format(d,'train' if not self.FLAGS.evaluate else 'test')
        if len(self.world_name)!=0: name='{0}_{1}'.format(name,self.world_name)
        sumvar[name]=vals[d]
        result_string='{0}, {1}:{2}'.format(result_string, name, vals[d])
      for k in losses_train.keys():
        name={'total':'Loss_train_total'}
        sumvar[name[k]]=np.mean(losses_train[k])
        result_string='{0}, {1}:{2}'.format(result_string, name[k], np.mean(losses_train[k]))
      
      # get all metrics of this episode and add them to var
      results = self.model.get_metrics()
      for k in results.keys(): 
        sumvar[k] = results[k]
        result_string='{0}, {1}:{2}'.format(result_string, k, results[k])
      
      if self.FLAGS.plot_depth and self.FLAGS.auxiliary_depth:
        sumvar["depth_predictions"]=depth_predictions
      # add driving duration (collision free)
      if self.driving_duration != -1: 
        result_string='{0}, driving_duration: {1:0.3f}'.format(result_string, self.driving_duration)
        sumvar['driving_time']=self.driving_duration
      # add imitation loss
      if len(self.imitation_loss)!=0:
        result_string='{0}, imitation_loss: {1:0.3}'.format(result_string, np.mean(self.imitation_loss))
        sumvar['imitation_loss']=np.mean(self.imitation_loss)
      # add depth loss
      if len(self.depth_loss)!=0:
        result_string='{0}, depth_loss: {1:0.3f}, depth_loss_var: {2:0.3f}'.format(result_string, np.mean(self.depth_loss), np.var(self.depth_loss))
        sumvar['depth_loss']=np.mean(self.depth_loss)
      if len(self.time_ctr_send) > 10 and len(self.time_im_received) > 10:
        # calculate control-rates and rgb-rates from differences
        avg_ctr_rate = 1/np.mean([self.time_ctr_send[i+1]-self.time_ctr_send[i] for i in range(len(self.time_ctr_send)-1)])
        std_ctr_delays = np.std([self.time_ctr_send[i+1]-self.time_ctr_send[i] for i in range(len(self.time_ctr_send)-1)])
        avg_im_rate = 1/np.mean([self.time_im_received[i+1]-self.time_im_received[i] for i in range(1,len(self.time_im_received)-1)]) #skip first image delay as network still needs to 'startup'
        std_im_delays = np.std([self.time_im_received[i+1]-self.time_im_received[i] for i in range(1,len(self.time_im_received)-1)])

        result_string='{0}, control_rate: {1:0.3f}, image_rate: {2:0.3f}, control_delay_std: {3:0.3f}, image_delay_std: {4:0.3f}'.format(result_string, avg_ctr_rate, avg_im_rate, std_ctr_delays, std_im_delays)
      try:
        self.model.summarize(sumvar)
      except Exception as e:
        print('failed to write', e)
        pass
      else:
        print(result_string)
      # ! Note: tf_log is used by evaluate_model train_model and train_and_evaluate_model in simulation_supervised/scripts
      # Script starts next run once this file is updated.
      try:
        f=open(os.path.join(self.logfolder,'tf_log'),'a')
        f.write(result_string)
        f.write('\n')
        f.close()
      except Exception as e:
        print('failed to write txt tf_log {}'.format(e))
        print('retry after sleep 60')
        time.sleep(60)
        f=open(os.path.join(self.logfolder,'tf_log'),'a')
        f.write(result_string)
        f.write('\n')
        f.close()
      # self.accumlosses = {}
      self.current_distance = 0
      self.last_pose = []
      self.nfc_images = []
      self.furthest_point = 0
      self.world_name = ''
      if self.runs['train']%20==0 and not self.FLAGS.evaluate:
        # Save a checkpoint every 20 runs.
        self.model.save(self.logfolder)
        print('model saved [run {0}]'.format(self.runs['train']))
      self.time_im_received=[]
      self.time_ctr_send=[]

      self.model.reset_metrics()
      
      self.start_time=0
      self.imitation_loss=[]
      self.depth_loss=[]
      self.driving_duration=-1
      self.img_index=0    
      self.fsm_index = 0          
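Because PilotNode wires up all of its subscribers and publishers in the constructor, launching it usually amounts to construction plus rospy.spin(). The flag object, model wrapper and log folder below are hypothetical placeholders, not part of the code above; only the PilotNode constructor is taken from it.

import rospy

FLAGS = parse_flags()                  # hypothetical: whatever flag/argument object the project builds
model = Model(FLAGS)                   # hypothetical: network wrapper exposing forward/backward/save/get_metrics
logfolder = '/tmp/pilot_log'

node = PilotNode(FLAGS, model, logfolder)   # registers callbacks and calls rospy.init_node
try:
    rospy.spin()                            # hand control to the ROS callback loop
except rospy.ROSInterruptException:
    pass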
Example #31
s_dim1 = env.s_dim
a_dim1 = env.a_dim
a_bound1 = env.a_bound

ddpg = DDPG(a_dim1, s_dim1, a_bound1, MAP_DIM, att_dim=32)
exploration_noise = OUNoise(a_dim1)  # control exploration
t1 = time.time()
replay_num = 0
env.set_map_seed(187)
for i in range(MAX_EPISODES):
    t_start = time.time()
    sd = i * 3 + 100

    m_sd, s, gm, loc = env.set_state_seed(sd)
    lm = env.get_local_map(loc)
    exploration_noise.reset()
    ep_reward = 0
    ave_dw = 0
    j = 0
    r = 0
    for j in range(MAX_EP_STEPS):
        # Add exploration noise
        a = ddpg.choose_action(s, gm, lm)
        ave_dw += np.linalg.norm(a)
        a += exploration_noise.noise()  # add randomness to action selection for exploration
        a = np.minimum(a_bound1, np.maximum(-a_bound1, a))
        a[0:4] /= max(np.linalg.norm(a[0:4]), 1e-8)

        s_, loc_, r, done = env.step(a)
Example #32
class DDPG():
    """Reinforcement Learning agent , learning using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor 0.99
        self.tau = 0.001  # for soft update of target parameters 0.01

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):

        self.total_reward = None
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        if self.total_reward is not None:
            self.total_reward += reward
        else:
            self.total_reward = reward

        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of reward tuples."""

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted actions of next-state  and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # track best score
        self.score = self.total_reward / float(
            self.count) if self.count else -np.inf
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
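A hedged sketch of how this task-based agent is typically driven: the Task object (with reset()/step() and the size/bound attributes read in __init__) and the episode count are assumptions; only reset_episode, act and step come from the class above.

task = Task()                              # assumed task exposing state_size, action_size, action_low/high
agent = DDPG(task)

for episode in range(500):
    state = agent.reset_episode()          # resets the task and the OU noise, returns the initial state
    while True:
        action = agent.act(state)          # actor output plus exploration noise
        next_state, reward, done = task.step(action)   # assumed Task.step signature
        agent.step(action, reward, next_state, done)   # store experience, learn when the buffer is large enough
        state = next_state
        if done:
            break
    print('episode {:3d}  score {:7.3f}  best {:7.3f}'.format(episode, agent.score, agent.best_score))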
Example #33
File: ddpg.py Project: Ivehui/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self,observation):
        # receive initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        action_batch = np.resize(action_batch,[BATCH_SIZE,1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch)
        for i in range(0,BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high)

    def set_feedback(self,observation,action,reward,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append((self.state,action,reward,next_state,done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Start training once more than REPLAY_START_SIZE transitions have been collected
        if self.time_step >  REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
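This variant keeps the current state internally, so a gym loop sets the observation once per episode and afterwards only passes feedback. The environment id and loop bounds below are assumptions; only set_init_observation, get_action and set_feedback come from the class above.

import gym

env = gym.make('InvertedPendulum-v1')          # assumed continuous-control task
agent = DDPG(env)

for episode in range(1000):
    observation = env.reset()
    agent.set_init_observation(observation)    # seed the agent's internal state
    for t in range(env.spec.timestep_limit):
        action = agent.get_action()            # policy plus OU noise, clipped to the action bounds
        observation, reward, done, _ = env.step(action)
        agent.set_feedback(observation, action, reward, done)   # store transition, train after REPLAY_START_SIZE steps
        if done:
            break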
Example #34
def s2l():

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    num_states = feature_size  #num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)

    agent = DDPG(env, is_batch_norm, num_states, num_actions)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0

    print("Number of Rollouts per episode:", num_rollouts)
    print("Number of Steps per roll out:", steps)
    reward_st = np.array([0])  #saving reward
    eval_metric_st = np.array([0])
    reward_st_all = np.array([0])  #saving reward after every step

    frame_obj = Frame_Feature()

    #activity_obj=Vid_Feature()
    demo_vid_array = demo_array_extractor(demo_folder)
    demo_features = frame_obj.video_feature_extractor(demo_vid_array)

    for episode in range(num_episodes):
        print("==== Starting episode no:", episode, "====", "\n")
        env.reset()  # Reset env in the begining of each episode
        env.render()
        obs_img = env.render(mode='rgb_array')  # Get the observation
        obs_img = np.array(misc.imresize(obs_img, [112, 112, 3]))
        observation = np.array(frame_obj.frame_feature_extractor(obs_img))
        observation = observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):

            reward_per_rollout = 0
            vid_robo_ = []

            for i in range(steps):

                x = observation

                action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
                noise = exploration_noise.noise()
                action = action[0] + noise  # select action according to current policy and exploration noise
                print('Action at episode-', episode, 'rollout-', t, 'step-', i,
                      " :", action)

                _, _, done, info = env.step(action)
                env.render()
                obs_robo_ = env.render(mode='rgb_array')  # Get the observation
                obs_robo = misc.imresize(obs_robo_, [112, 112, 3])
                vid_robo_.append(obs_robo)
                observation = np.array(
                    frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation = observation.reshape(-1)
                #pause()

                if (i == 15):
                    vid_robo = np.array(vid_robo_)
                    robo_features = frame_obj.video_feature_extractor(vid_robo)
                    reward = -(distance(demo_features, robo_features))
                    reward = np.array(reward)
                    print('reward: ', reward)
                else:
                    reward = 0
                    reward = np.array(reward)
                    print('reward: ', reward)

                # Printing eval_metric after every step
                eval_metric = np.array(env.get_eval())
                eval_metric = eval_metric.reshape(-1)
                print('Distance to goal:', eval_metric)
                eval_metric_st = np.append(eval_metric_st, eval_metric)
                np.savetxt('eval_metric_per_step.txt',
                           eval_metric_st,
                           newline="\n")

                # Storing reward after every step
                reward_st_all = np.append(reward_st_all, reward)
                np.savetxt('reward_all.txt', reward_st_all, newline="\n")

                #add s_t,s_t+1,action,reward to experience memory
                agent.add_experience(x, observation, action, reward, False)
                reward_per_rollout += reward
                counter += 1

            #train critic and actor network
            if counter > start_training:
                agent.train()
            print('\n\n')

            #Saving policy
            if ((episode % 50) == 0 and t == num_rollouts - 1):
                print('saving policy...........................!')
                agent.save_actor(episode)

            reward_per_episode += reward_per_rollout

        #check if episode ends:

        print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode)
        print("Printing reward to file")
        exploration_noise.reset()  # reinitializing random noise for action exploration
        reward_st = np.append(reward_st, reward_per_episode)
        np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n")
        print('\n\n')

        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / num_episodes))
Example #35
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load(self, model_dir, agent_id):
        # Load Actor and Critic network weights
        self.actor_local.load_state_dict(
            torch.load(
                os.path.join(model_dir,
                             'agent_{0}_actor.pth'.format(agent_id))))
        self.critic_local.load_state_dict(
            torch.load(
                os.path.join(model_dir,
                             'agent_{0}_critic.pth'.format(agent_id))))

    def save(self, model_dir, agent_id):
        # Save Actor and Critic network weights
        torch.save(
            self.actor_local.state_dict(),
            os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id)))
        torch.save(
            self.critic_local.state_dict(),
            os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id)))
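A minimal training loop for this PyTorch agent might look as follows. The environment id, episode counts and step limit are illustrative assumptions; only act, step and reset come from the class above, and module-level constants such as BATCH_SIZE, GAMMA and TAU are assumed to be defined alongside it.

import gym

env = gym.make('Pendulum-v0')                  # assumed task with (roughly) unit-bounded actions
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=2)

for episode in range(300):
    state = env.reset()
    agent.reset()                              # reset the OU noise between episodes
    score = 0.0
    for t in range(1000):
        action = agent.act(state)              # noisy action from the local actor, clipped to [-1, 1]
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store and learn once enough samples exist
        state = next_state
        score += reward
        if done:
            break
    print('episode {} score {:.2f}'.format(episode, score))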
Example #36
class AgentDDPG:
    def __init__(self, env, state_size, action_size):
        self.env = env
        self.replay_memory = deque()
        self.actor_network = actor_network.ActorNetwork(
            state_size, action_size)
        self.critic_network = critic_network.CriticNetwork(
            state_size, action_size)

        self.ou_noise = OUNoise(action_size)

        self.time_step = 0

    def set_state(self, obs):
        self.state = obs

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.ou_noise.noise(),
                       self.env.action_space.low, self.env.action_space.high)

    def set_feedback(self, obs, action, reward, done):
        next_state = obs
        self.replay_memory.append(
            (self.state, action, reward, next_state, done))

        self.state = next_state
        self.time_step += 1

        if len(self.replay_memory) > config.MEMORY_SIZE:
            self.replay_memory.popleft()

        # Start training once more than config.OBSERVATION_STEPS transitions have been collected
        if self.time_step > config.OBSERVATION_STEPS:
            self.train()

        if self.time_step % config.SAVE_EVERY_X_STEPS == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # reinit the random process when an episode ends
        if done:
            self.ou_noise.reset()

    def train(self):
        minibatch = random.sample(self.replay_memory, config.MINI_BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        action_batch = np.resize(action_batch, [config.MINI_BATCH_SIZE, 1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.get_target_action_batch(
            next_state_batch)
        q_value_batch = self.critic_network.get_target_q_batch(
            next_state_batch, next_action_batch)

        for i in range(0, config.MINI_BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] +
                               config.FUTURE_REWARD_DISCOUNT *
                               q_value_batch[i])

        y_batch = np.array(y_batch)
        y_batch = np.reshape(y_batch, [len(y_batch), 1])

        # Update critic by minimizing the loss
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch = self.actor_network.get_action_batch(state_batch)
        q_gradient_batch = self.critic_network.get_gradients(
            state_batch, action_batch)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
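The driving pattern here mirrors the earlier set_state / get_action / set_feedback interface. The environment and loop bounds below are assumptions, and the config constants (MEMORY_SIZE, OBSERVATION_STEPS, ...) are expected to live in the project's config module; only the three agent methods come from the class above.

import gym

env = gym.make('Pendulum-v0')                   # assumed continuous-control environment
agent = AgentDDPG(env,
                  state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0])

for episode in range(1000):
    obs = env.reset()
    agent.set_state(obs)                        # seed the internal state
    for t in range(200):
        action = agent.get_action()             # actor output plus OU noise, clipped to the action bounds
        obs, reward, done, _ = env.step(action)
        agent.set_feedback(obs, action, reward, done)   # store transition, train after OBSERVATION_STEPS
        if done:
            break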
Example #37
class DDPG:
	def __init__(self, env):
		self.name = 'DDPG' # name for uploading results
		self.environment = env
		
		state_dim = env.observation_space.shape[0]
		action_dim = env.action_space.shape[0]
		# Initialize time step
		self.time_step = 0
		# initialize replay buffer
		self.replay_buffer = deque()
		# initialize networks
		self.create_networks_and_training_method(state_dim,action_dim)

		self.sess = tf.InteractiveSession()
		self.sess.run(tf.initialize_all_variables())

		# loading networks
		self.saver = tf.train.Saver()
		checkpoint = tf.train.get_checkpoint_state("saved_networks")
		if checkpoint and checkpoint.model_checkpoint_path:
				self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
				print "Successfully loaded:", checkpoint.model_checkpoint_path
		else:
				print "Could not find old network weights"

		global summary_writer
		summary_writer = tf.train.SummaryWriter('~/logs',graph=self.sess.graph)
	
	def create_networks_and_training_method(self,state_dim,action_dim):

		theta_p = networks.theta_p(state_dim,action_dim)
		theta_q = networks.theta_q(state_dim,action_dim)
		target_theta_p,target_update_p = self.exponential_moving_averages(theta_p,TAU)
		target_theta_q,target_update_q = self.exponential_moving_averages(theta_q,TAU)

		self.state = tf.placeholder(tf.float32,[None,state_dim],'state')
		self.action_test = networks.policy_network(self.state,theta_p)

		# Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
		self.exploration = OUNoise(action_dim)
		noise = self.exploration.noise()
		self.action_exploration = self.action_test + noise

		q = networks.q_network(self.state,self.action_test,theta_q)
		# policy optimization
		mean_q = tf.reduce_mean(q)
		weight_decay_p = tf.add_n([L2_POLICY * tf.nn.l2_loss(var) for var in theta_p])  
		loss_p = -mean_q + weight_decay_p

		optim_p = tf.train.AdamOptimizer(P_LEARNING_RATE)
		grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=theta_p)
		optimize_p = optim_p.apply_gradients(grads_and_vars_p)
		with tf.control_dependencies([optimize_p]):
			self.train_p = tf.group(target_update_p)

		# q optimization
		self.action_train = tf.placeholder(tf.float32,[None,action_dim],'action_train')
		self.reward = tf.placeholder(tf.float32,[None],'reward')
		self.next_state = tf.placeholder(tf.float32,[None,state_dim],'next_state')
		self.done = tf.placeholder(tf.bool,[None],'done')

		q_train = networks.q_network(self.state,self.action_train,theta_q)
		next_action = networks.policy_network(self.next_state,theta=target_theta_p)
		next_q = networks.q_network(self.next_state,next_action,theta=target_theta_q)
		q_target = tf.stop_gradient(tf.select(self.done,self.reward,self.reward + GAMMA * next_q))

		# q loss
		q_error = tf.reduce_mean(tf.square(q_target - q_train))
		weight_decay_q = tf.add_n([L2_Q * tf.nn.l2_loss(var) for var in theta_q])
		loss_q = q_error + weight_decay_q

		optim_q = tf.train.AdamOptimizer(Q_LEARNING_RATE)
		grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=theta_q)
		optimize_q = optim_q.apply_gradients(grads_and_vars_q)
		with tf.control_dependencies([optimize_q]):
			self.train_q = tf.group(target_update_q)

		tf.scalar_summary("loss_q",loss_q)
		tf.scalar_summary("loss_p",loss_p)
		tf.scalar_summary("q_mean",mean_q)
		global merged_summary_op
		merged_summary_op = tf.merge_all_summaries()

	def train(self):
		#print "train step",self.time_step
		# Sample a random minibatch of N transitions from replay buffer
		minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
		state_batch = [data[0] for data in minibatch]
		action_batch = [data[1] for data in minibatch]
		reward_batch = [data[2] for data in minibatch]
		next_state_batch = [data[3] for data in minibatch]
		done_batch = [data[4] for data in minibatch]

		_,_,summary_str = self.sess.run([self.train_p,self.train_q,merged_summary_op],feed_dict={
			self.state:state_batch,
			self.action_train:action_batch,
			self.reward:reward_batch,
			self.next_state:next_state_batch,
			self.done:done_batch
			})

		summary_writer.add_summary(summary_str,self.time_step)

		# save network every 1000 iteration
		if self.time_step % 1000 == 0:
			self.saver.save(self.sess, 'saved_networks/' + 'network' + '-ddpg', global_step = self.time_step)

	def noise_action(self,state):
		# Select action a_t according to the current policy and exploration noise
		action = self.sess.run(self.action_exploration,feed_dict={
			self.state:[state]
			})[0]
		return np.clip(action,self.environment.action_space.low,self.environment.action_space.high)

	def action(self,state):
		action = self.sess.run(self.action_test,feed_dict={
			self.state:[state]
			})[0]
		return np.clip(action,self.environment.action_space.low,self.environment.action_space.high)

	def perceive(self,state,action,reward,next_state,done):
		# Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
		self.replay_buffer.append((state,action,reward,next_state,done))
		# Update time step
		self.time_step += 1

		# Limit the replay buffer size
		if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
			self.replay_buffer.popleft()

		# Start training once more than REPLAY_START_SIZE transitions have been collected
		if self.time_step >  REPLAY_START_SIZE:
			self.train()

		# Re-initialize the random process when an episode ends
		if done:
			self.exploration.reset()

	# Keep exponential moving averages of the parameters; with decay = 1 - tau this realizes the soft target update
	def exponential_moving_averages(self,theta, tau=0.001):
		ema = tf.train.ExponentialMovingAverage(decay=1 - tau)
		update = ema.apply(theta)  # also creates shadow vars
		averages = [ema.average(x) for x in theta]
		return averages, update
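Note that exponential_moving_averages is what realizes the soft target update in this example: tf.train.ExponentialMovingAverage with decay = 1 - tau maintains shadow = (1 - tau) * shadow + tau * var, i.e. the familiar θ_target = τ*θ_local + (1 - τ)*θ_target. A tiny NumPy sketch of the same arithmetic (the TAU value is an assumption):

import numpy as np

TAU = 0.001                                    # assumed soft-update rate
theta_local = np.array([1.0, 2.0, 3.0])        # stands in for one trained weight tensor
theta_target = np.zeros(3)                     # target-network copy

for _ in range(5):
    # one soft update per training step: the target slowly tracks the local weights
    theta_target = TAU * theta_local + (1.0 - TAU) * theta_target
print(theta_target)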