# Example #1
#
# NOTE: the three run() excerpts below assume the rest of the project is
# importable: the hyperparameter constants (MAX_EPISODES, BATCH_SIZE, ...),
# the helpers (ReplayMemory, OUNoise, generate_action, sac_update_stage,
# ddpg_update_stage), the loggers (logger, logger_cal) and SummaryWriter
# (e.g. from tensorboardX or torch.utils.tensorboard) all come from the
# surrounding modules. Only the common third-party imports are spelled out.
from collections import deque

import numpy as np
import rospy
import torch

def run(comm, env, policy, critic, critic_opt, critic_target, policy_path,
        action_bound, optimizer):
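    # SAC-style training loop: rank 0 gathers the per-robot states over MPI,
    # generate_action() produces an action for every robot, the scaled actions
    # are scattered back to the workers, and rank 0 alone pushes transitions
    # into the replay memory and calls sac_update_stage() once a full batch is
    # available.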

    buff = []
    global_update = 0
    global_step = 0

    # world reset
    if env.index == 0:
        env.reset_world()

    memory_position = 0
    update = 0

    # replay_memory
    replay_memory = ReplayMemory(REPLAY_SIZE, SEED)

    for id in range(MAX_EPISODES):

        # reset
        env.reset_pose()

        terminal = False
        ep_reward = 0
        step = 1

        # generate goal
        env.generate_goal_point()

        # get_state
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # episode 1
        while not terminal and not rospy.is_shutdown():

            state_list = comm.gather(state, root=0)

            ## get_action
            #-------------------------------------------------------------------------
            # generate actions at rank==0
            a, logprob, scaled_action = generate_action(
                env=env,
                state_list=state_list,
                policy=policy,
                action_bound=action_bound)

            # execute actions
            #-------------------------------------------------------------------------
            real_action = comm.scatter(scaled_action, root=0)

            ## run action
            #-------------------------------------------------------------------------
            env.control_vel(real_action)

            rospy.sleep(0.001)

            ## get reward
            #-------------------------------------------------------------------------
            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            #-------------------------------------------------------------------------

            # get next state
            #-------------------------------------------------------------------------
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()

            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            #-------------------------------------------------------------------------

            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            state_next_list = comm.gather(state_next, root=0)

            ## training
            #-------------------------------------------------------------------------
            if env.index == 0:

                # add data in replay_memory
                #-------------------------------------------------------------------------
                replay_memory.push(state[0], state[1], state[2], a, logprob,
                                   r_list, state_next[0], state_next[1],
                                   state_next[2], terminal_list)
                if len(replay_memory) > BATCH_SIZE:

                    ## update
                    #-------------------------------------------------------------------------
                    update = sac_update_stage(policy=policy,
                                              optimizer=optimizer,
                                              critic=critic,
                                              critic_opt=critic_opt,
                                              critic_target=critic_target,
                                              batch_size=BATCH_SIZE,
                                              memory=replay_memory,
                                              epoch=EPOCH,
                                              replay_size=REPLAY_SIZE,
                                              tau=TAU,
                                              alpha=ALPHA,
                                              gamma=GAMMA,
                                              updates=update,
                                              update_interval=UPDATE_INTERVAL,
                                              num_step=BATCH_SIZE,
                                              num_env=NUM_ENV,
                                              frames=LASER_HIST,
                                              obs_size=OBS_SIZE,
                                              act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        ## save policy
        #--------------------------------------------------------------------------------------------------------------
        if env.index == 0:
            if global_update != 0 and global_update % 1000 == 0:
                torch.save(policy.state_dict(),
                           policy_path + '/policy_{}'.format(global_update))
                torch.save(critic.state_dict(),
                           policy_path + '/critic_{}'.format(global_update))
                logger.info('########## model saved after {} updates ##########'.format(
                    global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
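

# The ReplayMemory used above comes from the surrounding project and is not
# shown in this file. The class below is a minimal sketch of a compatible
# buffer, inferred only from how push() and len() are called in these loops;
# the name SimpleReplayMemory and its internals are assumptions, not the
# project's actual implementation.
import random


class SimpleReplayMemory(object):
    """Fixed-capacity FIFO buffer that stores whole transition tuples."""

    def __init__(self, capacity, seed):
        random.seed(seed)
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, *transition):
        # Overwrite the oldest transition once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


# Example #2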
def run(comm, env, policy, policy_path, action_bound, optimizer):
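    # DDPG-style variant of the loop above: the policy tuple is unpacked into
    # actor / actor_target / critic / critic_target networks, exploration is
    # added with Ornstein-Uhlenbeck noise, and rank 0 runs ddpg_update_stage()
    # on the shared replay memory.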

    actor, actor_target, critic, critic_target = policy
    actor_opt, critic_opt = optimizer

    # set OU noise
    noise_c = OUNoise(action_dim=ACT_SIZE, action_bound=action_bound)
    noise_c.reset()

    buff = []
    global_update = 0
    global_step = 0

    replay_memory = ReplayMemory(REPLAY_SIZE, SEED)

    #world reset
    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):

        #reset
        env.reset_pose()

        terminal = False
        ep_reward = 0
        step = 1

        # generate goal
        env.generate_goal_point()

        # get_state
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # episode 1
        while not terminal and not rospy.is_shutdown():

            state_list = comm.gather(state, root=0)

            ## get_action
            #-------------------------------------------------------------------------
            # generate actions at rank==0
            mean, action = generate_action(env=env,
                                           state_list=state_list,
                                           actor=actor,
                                           action_bound=action_bound)

            ## exploration
            #--------------------------------------------------------------------------
            a = noise_c.get_action(mean, step)
            #noise = np.random.normal(0, exploration_noise, size=2) #action size check
            #a = mean + noise
            #a = a.clip(action_bound[0], action_bound[1])

            # execute actions
            #-------------------------------------------------------------------------
            real_action = comm.scatter(a, root=0)

            ## run action
            #-------------------------------------------------------------------------
            env.control_vel(real_action)

            # rate.sleep()
            rospy.sleep(0.001)

            ## get reward
            #-------------------------------------------------------------------------
            # get information
            r, terminal, result = env.get_reward_and_terminate(step)

            ep_reward += r
            global_step += 1

            #-------------------------------------------------------------------------
            # get next state
            #-------------------------------------------------------------------------

            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            state_next_list = comm.gather(state_next, root=0)

            #-------------------------------------------------------------------------
            ## training
            #-------------------------------------------------------------------------
            if env.index == 0:

                ## save data in memory
                #-------------------------------------------------------------------------
                replay_memory.push(state[0], state[1], state[2], a, r_list,
                                   state_next[0], state_next[1], state_next[2],
                                   terminal_list)
                policy_list = [actor, actor_target, critic, critic_target]
                optimizer_list = [actor_opt, critic_opt]

                if len(replay_memory) > BATCH_SIZE:

                    ## Update step by step
                    #-------------------------------------------------------------------------
                    ddpg_update_stage(policy=policy_list,
                                      optimizer=optimizer_list,
                                      batch_size=BATCH_SIZE,
                                      memory=replay_memory,
                                      epoch=EPOCH,
                                      replay_size=REPLAY_SIZE,
                                      gamma=GAMMA,
                                      num_step=BATCH_SIZE,
                                      num_env=NUM_ENV,
                                      frames=LASER_HIST,
                                      obs_size=OBS_SIZE,
                                      act_size=ACT_SIZE,
                                      tau=TAU)
                    global_update += 1

            step += 1
            state = state_next

        ## save policy and log
        #-------------------------------------------------------------------------
        if env.index == 0:
            if global_update != 0 and global_update % 3000 == 0:
                torch.save(actor.state_dict(),
                           policy_path + '/actor_{}'.format(global_update))
                torch.save(critic.state_dict(),
                           policy_path + '/critic_{}'.format(global_update))

                logger.info('########## model saved after {} updates ##########'.format(
                    global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
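

# The OUNoise helper above (reset() / get_action(mean, step)) is also defined
# elsewhere in the project. Below is a minimal Ornstein-Uhlenbeck noise sketch
# with the same interface; the name SimpleOUNoise, the default parameters and
# the sigma-decay schedule are assumptions, not the project's implementation.
class SimpleOUNoise(object):
    def __init__(self, action_dim, action_bound, mu=0.0, theta=0.15,
                 sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.max_sigma = sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_dim
        self.low, self.high = action_bound
        self.reset()

    def reset(self):
        # Start the noise process at the mean.
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # dx = theta * (mu - x) dt + sigma * dW
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        # Add the OU sample to the deterministic action, anneal sigma linearly
        # from max_sigma to min_sigma over decay_period steps, and clip the
        # result to the action bounds.
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) \
            * min(1.0, float(t) / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)


# Example #3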
def run(comm, env, agent, policy_path, args):
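    # SAC training loop against the Gazebo simulation: a SAC `agent` object
    # selects actions and updates its own networks, rank 0 owns the replay
    # memory and a TensorBoard SummaryWriter, and rewards and checkpoints are
    # logged from rank 0 only.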

    # Training Loop
    test_interval = 10
    save_interval = 100

    total_numsteps = 0
    updates = 0

    # world reset
    if env.index == 0:
        env.reset_gazebo_simulation()
        # TensorBoard writer (rank 0 only)
        writer = SummaryWriter('test_runs/' + policy_path)
        

    # replay_memory     
    memory = ReplayMemory(args.replay_size, args.seed)

    

    for i_episode in range(args.num_steps):

        episode_reward = 0
        episode_steps = 0
        done = False 

        # Env reset
        env.set_gazebo_pose()
        # generate goal
        env.generate_goal_point_gazebo()    
        
        # Get initial state
        frame = env.get_laser_observation()
        frame_stack = deque([frame, frame, frame])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [frame_stack, goal, speed]

        # Episode start
        while not done and not rospy.is_shutdown():    
            state_list = comm.gather(state, root=0)

            if env.index == 0:
                action = agent.select_action(state_list)
            else:
                action = None
            
            if env.index == 0:
                if len(memory) > args.batch_size:
                    # Number of updates per step in environment
                    for i in range(args.updates_per_step):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates)

                        writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                        writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                        writer.add_scalar('loss/policy', policy_loss, updates)
                        writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                        writer.add_scalar('entropy_temperature/alpha', alpha, updates)
                        updates += 1
                 
            # Execute actions
            #-------------------------------------------------------------------------            
            # action bounds: [[v_min, w_min], [v_max, w_max]]
            action_clip_bound = [[0, -1], [1, 1]]
            if env.index == 0:
                clipped_action = np.clip(action, a_min=action_clip_bound[0],
                                         a_max=action_clip_bound[1])
            else:
                # Non-root ranks have nothing to clip; they only receive their
                # slice from the scatter below.
                clipped_action = None
            real_action = comm.scatter(clipped_action, root=0)
            
            #if real_action[0] >= 0.02 and real_action[0] <= 0.2:
            #    real_action[0] = real_action[0] / 0.6
            # Floor small positive linear velocities and apply a dead band to
            # small angular velocities before sending the command.
            if 0 < real_action[0] < 0.10:
                real_action[0] = 0.1

            if 0 < abs(real_action[1]) < 0.15:
                real_action[1] = 0.0
            
            env.control_vel(real_action)

            #rospy.sleep(0.001)

            ## Get reward and terminal state
            reward, done, result = env.get_reward_and_terminate(episode_steps)
            #print("Action : [{}, {}], Distance : {}, Reward : {}".format(real_action[0], real_action[1], env.distance, reward))

            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Get next state
            next_frame = env.get_laser_observation()
            left = frame_stack.popleft()
            
            frame_stack.append(next_frame)
            next_goal = np.asarray(env.get_local_goal())
            next_speed = np.asarray(env.get_self_speed())
            next_state = [frame_stack, next_goal, next_speed]

            r_list = comm.gather(reward, root=0)
            done_list = comm.gather(done, root=0)
            next_state_list = comm.gather(next_state, root=0)

            if env.index == 0:
                #meomry.list_push(state_list, action, r_list, next_state_list, done_list)
                # Append each rank's transition to the replay memory.
                for i in range(len(state_list)):
                    memory.push(state_list[i][0], state_list[i][1], state_list[i][2],
                                action[i], r_list[i],
                                next_state_list[i][0], next_state_list[i][1],
                                next_state_list[i][2], done_list[i])

            state = next_state  
        

        #if total_numsteps > args.num_steps:
        #    break
        
        if env.index == 0:
            writer.add_scalar('reward/train', episode_reward, i_episode)


        if env.index == 0:
            #if global_update != 0 and global_update % 20 == 0:
            if i_episode != 0 and i_episode % save_interval == 0:
                torch.save(agent.policy.state_dict(),
                           policy_path + '/policy_epi_{}'.format(i_episode))
                torch.save(agent.critic_1.state_dict(),
                           policy_path + '/critic_1_epi_{}'.format(i_episode))
                torch.save(agent.critic_2.state_dict(),
                           policy_path + '/critic_2_epi_{}'.format(i_episode))
                print('########## policy and critic models saved at episode {} '
                      '##########'.format(i_episode))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)
        print("Env: {}, memory_size: {}, Goal: ({}, {}), Episode: {}, steps: {}, "
              "Reward: {}, Distance: {}, {}".format(
                  env.index, len(memory), round(env.goal_point[0], 2),
                  round(env.goal_point[1], 2), i_episode + 1, episode_steps,
                  round(episode_reward, 2), round(distance, 2), result))
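

# The gather/scatter pattern used by all three loops can be exercised on its
# own. The toy function below is an illustration only (the name
# _gather_scatter_demo and the fake observations are not project code): rank 0
# collects one observation per MPI rank and sends one "action" back to each.
def _gather_scatter_demo(comm):
    rank = comm.Get_rank()
    obs = np.random.rand(3)                        # this rank's observation
    obs_list = comm.gather(obs, root=0)            # rank 0 receives a list of all obs
    if rank == 0:
        actions = [o[:2] * 0.1 for o in obs_list]  # one action per rank
    else:
        actions = None
    # Every rank (including rank 0) gets back exactly its own action.
    return comm.scatter(actions, root=0)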