def run(comm, env, policy, policy_path, action_bound, optimizer):
    """PPO stage-1 training loop: the goal is regenerated whenever it is reached,
    and the episode only ends after EP_LEN steps (or on ROS shutdown)."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = ID_EP
    global_step = 0

    if env.index == 0:
        env.reset_world()
        env.store_resetPose()

    for id in range(MAX_EPISODES):
        env.generate_goal_point()
        print("new goal:", env.goal_point, "and rank:", env.mpi_rank)
        terminal = False
        next_ep = False
        ep_reward = 0
        step = 1

        env.reset_pose()
        obs = env.get_laser_observation()
        if not LASER_NORM:
            obs = (obs + 0.5) * 3.5
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not next_ep and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(env=env, state_list=state_list,
                                                           policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            # resample the goal as soon as it is reached
            if result == "Reach Goal":
                env.generate_goal_point()
                print("new goal:", env.goal_point)

            # on a terminal event, reset the robot pose and the laser history
            if terminal:
                env.reset_pose()
                obs = env.get_laser_observation()
                # if not LASER_NORM:
                #     obs = (obs + 0.5) * 3.5
                obs_stack = deque([obs, obs, obs])

            if step > EP_LEN:
                next_ep = True

            # get next state
            s_next = env.get_laser_observation()
            # if not LASER_NORM:
            #     s_next = (s_next + 0.5) * 3.5
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.mpi_rank == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    print("PPO update start")
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory,
                                      epoch=EPOCH, coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE,
                                      num_step=HORIZON, num_env=NUM_ENV, frames=LASER_HIST,
                                      obs_size=OBS_SIZE, act_size=ACT_SIZE)
                    print("PPO update finish")

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.mpi_rank == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)
        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.mpi_rank, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        print("before ep_reward")
        logger_cal.info(ep_reward)
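# ---------------------------------------------------------------------------
# Illustration (assumption): the generate_train_data() calls above take the
# HORIZON x NUM_ENV arrays of rewards, values and dones plus the bootstrap
# value last_v, and return value targets and advantages. A minimal GAE(lambda)
# sketch of such a helper is given below; it is NOT this repository's
# implementation, and the real array shapes and details may differ.
# ---------------------------------------------------------------------------
import numpy as np


def generate_train_data_sketch(rewards, gamma, values, last_value, dones, lam):
    """Hypothetical GAE(lambda) helper; arrays assumed shaped (num_step, num_env)."""
    num_step, num_env = rewards.shape
    # Append the bootstrap value of the state reached after the last step.
    values_ext = np.concatenate([values, np.reshape(last_value, (1, num_env))], axis=0)

    advantages = np.zeros((num_step, num_env), dtype=np.float32)
    gae = np.zeros(num_env, dtype=np.float32)
    for t in reversed(range(num_step)):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * values_ext[t + 1] * not_done - values_ext[t]
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae

    targets = advantages + values  # regression targets for the value head
    return targets, advantages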
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """PPO stage-1 training loop with an additional social-reward term that is
    accumulated per episode and logged separately via logger_social."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0
    accumulated_social_reward = 0

    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        env.reset_pose()
        env.generate_goal_point()
        terminal = False
        ep_reward = 0
        accumulated_social_reward = 0
        step = 1

        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(env=env, state_list=state_list,
                                                           policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result, social_reward = env.get_reward_and_terminate(
                step, SOCIAL_CALC_MIN_DISTANCE, SOCIAL_CALC_DELTA_T, SOCIAL_REWARD_COEFF)
            ep_reward += r
            global_step += 1
            accumulated_social_reward += social_reward

            # get next state
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory,
                                      epoch=EPOCH, coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE,
                                      num_step=HORIZON, num_env=NUM_ENV, frames=LASER_HIST,
                                      obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)
        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
        # log social reward
        logger_social.info("{},{}".format(env.index, accumulated_social_reward))
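# ---------------------------------------------------------------------------
# Illustration (assumption): each entry appended to buff above is a tuple
# (state_list, a, r_list, terminal_list, logprob, v), where state_list holds
# one [obs_stack, goal, speed] triple per environment. The sketch below shows
# how a transform_buffer-style helper could stack such a buffer into the
# per-field batches consumed by generate_train_data() and the PPO update.
# It is an inference from the call sites, not this repository's actual code.
# ---------------------------------------------------------------------------
import numpy as np


def transform_buffer_sketch(buff):
    """Hypothetical helper: stack a HORIZON-length rollout buffer into batches."""
    obs_batch, goal_batch, speed_batch = [], [], []
    a_batch, r_batch, d_batch, l_batch, v_batch = [], [], [], [], []
    for state_list, a, r_list, terminal_list, logprob, v in buff:
        obs_batch.append([np.asarray(s[0]) for s in state_list])    # stacked laser frames
        goal_batch.append([np.asarray(s[1]) for s in state_list])   # local goal
        speed_batch.append([np.asarray(s[2]) for s in state_list])  # current velocity
        a_batch.append(a)
        r_batch.append(r_list)
        d_batch.append(terminal_list)
        l_batch.append(logprob)
        v_batch.append(v)
    # After stacking, every batch is shaped (num_step, num_env, ...).
    return (np.asarray(obs_batch), np.asarray(goal_batch), np.asarray(speed_batch),
            np.asarray(a_batch), np.asarray(r_batch), np.asarray(d_batch),
            np.asarray(l_batch), np.asarray(v_batch))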
def run(env, policy, policy_path, action_bound, optimizer):
    """Single-process PPO loop for the adversarial pushing agent: the MPI
    gather/scatter calls are replaced with single-element lists, and the laser
    observation is dropped in favour of the agent position/goal features."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0

    if env.index == 0:
        env.reset_world()
        env.store_resetPose()

    for id in range(MAX_EPISODES):
        env.reset_pose()
        # env.generate_goal_point()
        terminal = False
        next_ep = False
        ep_reward = 0
        step = 1

        # obs = env.get_laser_observation()
        # obs_stack = deque([obs, obs, obs])
        ad_speed = np.asarray(env.get_self_speed())
        ag_pos_and_goal = np.asarray(env.get_local_ag_pos_and_goal())
        state = [ad_speed, ag_pos_and_goal]

        while not terminal and not rospy.is_shutdown():
            # state_list = comm.gather(state, root=0)
            state_list = [state]

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action_adPush(env=env, state_list=state_list,
                                                                  policy=policy, action_bound=action_bound)

            # execute actions
            # real_action = comm.scatter(scaled_action, root=0)
            real_action = scaled_action[0]
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            # if terminal:
            #     env.reset_pose()
            # if step > EP_LEN:
            #     next_ep = True

            # get next state
            # s_next = env.get_laser_observation()
            # left = obs_stack.popleft()
            # obs_stack.append(s_next)
            # goal_next = np.asarray(env.get_local_goal())
            # speed_next = np.asarray(env.get_self_speed())
            # state_next = [obs_stack, goal_next, speed_next]
            ad_speed = np.asarray(env.get_self_speed())
            ag_pos_and_goal = np.asarray(env.get_local_ag_pos_and_goal())
            state_next = [ad_speed, ag_pos_and_goal]

            if global_step % HORIZON == 0:
                # state_next_list = comm.gather(state_next, root=0)
                state_next_list = [state_next]
                last_v, _, _, _ = generate_action_adPush(env=env, state_list=state_next_list,
                                                         policy=policy, action_bound=action_bound)

            # add transitions to buff and update policy
            # r_list = comm.gather(r, root=0)
            r_list = [r]
            # terminal_list = comm.gather(terminal, root=0)
            terminal_list = [terminal]

            if env.mpi_rank == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer_adPush(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_adversayPush(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory,
                                            epoch=EPOCH, coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE,
                                            num_step=HORIZON, num_env=NUM_ENV, frames=LASER_HIST,
                                            obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.mpi_rank == 0:
            if TRAIN:
                if global_update != 0 and global_update % 20 == 0:
                    torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                    logger.info('########################## model saved when update {} times#########'
                                '################'.format(global_update))

        # distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 + (env.goal_point[1] - env.init_pose[1])**2)
        distance = 0
        if TRAIN:
            logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                        (env.mpi_rank, 0, 3, id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """PPO stage-2 training loop for Gazebo: the world is reset whenever a robot
    terminates, and checkpoints are written as gazebo_<update>.pth."""
    rate = rospy.Rate(40)
    buff = []
    global_update = 0
    global_step = 0
    step = 0
    ep_reward = 0

    # import pdb; pdb.set_trace()
    if env.index == 0:
        # import pdb; pdb.set_trace()
        env.reset_world()

    for id in range(MAX_EPISODES):
        # env.reset_pose()
        env.generate_goal_point()

        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(env=env, state_list=state_list,
                                                           policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            # if liveflag == True:
            #     print(real_action)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            print('r: ', r)
            # r, terminal, result = 1, 0, 1  # env.get_reward_and_terminate(step)
            step += 1
            ep_reward += r
            if terminal:
                print('done: ', result)
                # import pdb; pdb.set_trace()
                env.reset_world()
                ep_reward = 0
                step = 1
            global_step += 1

            # get next state
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            terminal_list = comm.bcast(terminal_list, root=0)

            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    filter_index = get_filter_index(d_batch)
                    # print(len(filter_index))
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    # import pdb; pdb.set_trace()
                    ppo_update_stage2(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory,
                                      filter_index=filter_index, epoch=EPOCH, coeff_entropy=COEFF_ENTROPY,
                                      clip_value=CLIP_VALUE, num_step=HORIZON, num_env=NUM_ENV,
                                      frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            state = state_next

            if env.index == 0:
                if global_update != 0 and global_update % 20 == 0:
                    torch.save(policy.state_dict(), policy_path + '/gazebo_{}.pth'.format(global_update))
                    logger.info('########################## model saved when update {} times#########'
                                '################'.format(global_update))

            logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, %s,' %
                        (env.index, env.goal_point[0], env.goal_point[1], id, step - 1, ep_reward, result))
            logger_cal.info(ep_reward)
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """PPO loop for the adversarial "general" agent (V2): Gaussian noise is
    injected into the observation and the action, and both the noise std and
    the speed scale are scheduled per episode via get_factors()."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0

    if env.index == 0:
        env.reset_world()
        env.store_resetPose()

    for id in range(MAX_EPISODES):
        env.reset_pose()
        terminal = False
        next_ep = False
        ep_reward = 0
        step = 1

        # schedule the observation-noise std and the speed scale for this episode
        STATE_NOISE_STD, SPEED_SCALE = get_factors(func_index=FUN_INDEX,
                                                   ep_lenth=id,
                                                   std_bound=STD_BOUND,
                                                   speed_bound=SPEED_BOUND)
        print("ep: %d, std: %f, speed: %f" % (id, STATE_NOISE_STD, SPEED_SCALE))

        noise = np.random.normal(0, STATE_NOISE_STD, 8)
        goal = np.asarray(env.get_local_pos())
        speed = np.asarray(env.get_speeds())
        state = [goal + noise]

        while not next_ep and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action_adGeneralV2(env=env, state_list=state_list,
                                                                       policy=policy, action_bound=action_bound)

            # execute actions (with noise and speed scaling applied)
            real_action = comm.scatter(scaled_action, root=0)
            noise = np.random.normal(0, STATE_NOISE_STD, 2)
            print("noise:", noise)
            real_action = real_action + np.abs(noise)
            env.control_vel(real_action * SPEED_SCALE)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            if terminal:
                env.reset_pose()
            if step > EP_LEN:
                next_ep = True

            # get next state
            noise = np.random.normal(0, STATE_NOISE_STD, 8)
            goal_next = np.asarray(env.get_local_pos())
            speed_next = np.asarray(env.get_speeds())
            state_next = [goal_next + noise]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action_adGeneralV2(env=env, state_list=state_next_list,
                                                              policy=policy, action_bound=action_bound)

            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.mpi_rank == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    goal_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer_adGeneralV2(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (goal_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    if TRAIN:
                        ppo_update_adversayGeneralV2(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE,
                                                     memory=memory, epoch=EPOCH, coeff_entropy=COEFF_ENTROPY,
                                                     clip_value=CLIP_VALUE, num_step=HORIZON, num_env=NUM_ENV,
                                                     frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.mpi_rank == 0:
            if TRAIN:
                if global_update != 0 and global_update % 20 == 0:
                    torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                    logger.info('########################## model saved when update {} times#########'
                                '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)
        if TRAIN:
            logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                        (env.mpi_rank, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """Refactored PPO stage-1 training loop: observation building and reward
    computation are folded into env.get_state() and env.step()."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0

    # world reset
    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        # reset
        env.reset_pose()
        env.generate_goal_point()
        terminal = False
        ep_reward = 0
        step = 1

        # get_state
        state = env.get_state()

        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            ## get_action
            # -------------------------------------------------------------------------
            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(env=env, state_list=state_list,
                                                           policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            # -------------------------------------------------------------------------

            ### step ############################################################
            state_next, r, terminal, result = env.step(real_action, step)
            ep_reward += r
            global_step += 1

            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            ########################################################################

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            ## training
            # -------------------------------------------------------------------------
            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory,
                                      epoch=EPOCH, coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE,
                                      num_step=HORIZON, num_env=NUM_ENV, frames=LASER_HIST,
                                      obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)
        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
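# ---------------------------------------------------------------------------
# Illustration (assumption): ppo_update_stage1/stage2 above are expected to
# run several epochs of clipped-surrogate PPO over the collected memory. The
# sketch below shows one such minibatch step in PyTorch. The method
# policy.evaluate_actions(...) and the loss coefficients are hypothetical
# placeholders; this is a generic PPO step, not this repository's exact code.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F


def ppo_minibatch_step_sketch(policy, optimizer, obs, goal, speed, action,
                              old_logprob, target_v, advantage,
                              clip_value=0.1, coeff_entropy=5e-4, coeff_value=20.0):
    """One hypothetical clipped-surrogate PPO update on a single minibatch."""
    value, logprob, entropy = policy.evaluate_actions(obs, goal, speed, action)

    ratio = torch.exp(logprob - old_logprob)                  # pi_new / pi_old
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_value, 1.0 + clip_value) * advantage
    policy_loss = -torch.min(surr1, surr2).mean()             # clipped surrogate objective

    value_loss = F.mse_loss(value.view(-1), target_v.view(-1))
    loss = policy_loss + coeff_value * value_loss - coeff_entropy * entropy.mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return policy_loss.item(), value_loss.item(), entropy.mean().item()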