def run(args):
    env = gym.make(args.env)
    device = torch.device(args.device)

    # 1. Set some necessary seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create nets.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    hidden_sizes = (256, 256)
    ac = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    ac_target = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    hard_update(ac, ac_target)

    # env_sampler = EnvSampler(env, max_episode_step=4000, capacity=1e6)
    env_sampler = EnvSampler2(env, gamma=args.gamma1, capacity=1e6)
    alg = SAC(ac, ac_target, gamma=args.gamma2, alpha=0.2,
              q_lr=1e-3, pi_lr=1e-3, target_lr=5e-3, device=device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state)

    def get_mean_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state, deterministic=True)

    start_time = time()
    for _ in range(args.start_steps):
        env_sampler.addSample()
    print("Warmup uses {}s.".format(time() - start_time))

    for step in range(1, args.total_steps + 1):
        env_sampler.addSample(get_action)
        if step % args.update_every == 0:
            for _ in range(args.update_every):
                batch = env_sampler.sample(args.batch_size)
                losses = alg.update(*batch)
        if step % args.test_every == 0:
            test_reward = env_sampler.test(get_mean_action)
            yield (step, test_reward, *losses)

    torch.save(ac.pi.state_dict(), './env_{}_pi_net.pth.tar'.format(args.env))
def main(args):
    env = gym.make('Carla-v0', n_heroes=N_HEROES, port=PORT)
    replay = MultiReplayBuffer(CAPACITY)

    from sac import SAC
    import torch
    import bz_utils as bzu

    bzu.log.init('log_v1')

    updates = 0
    trainer = SAC(OBSERVATION_SHAPE, N_ACTIONS, args)
    agent = trainer.policy
    # agent.load_state_dict(torch.load('log/latest.t7'))

    for _ in tqdm.tqdm(range(1000)):
        totals = [0 for _ in range(N_HEROES)]
        finished = list()
        states = env.reset(n_vehicles=N_VEHICLES, n_pedestrians=N_PEDESTRIANS)

        for i in tqdm.tqdm(range(1000), desc='Experiences'):
            _, _, actions = agent.sample(preprocess(states))
            actions = actions.detach().cpu().numpy()
            new_states, rewards, dones, infos = env.step(actions)

            for j in range(N_HEROES):
                totals[j] += rewards[j]
                if dones[j]:
                    finished.append(totals[j])
                    totals[j] = 0

            # env.render()
            replay.add(states, actions, rewards, new_states, dones)
            states = new_states

        for j in range(N_HEROES):
            totals[j] += rewards[j]
            finished.append(totals[j])

        bzu.log.scalar(is_train=True, **{'cumulative': np.mean(finished)})

        for i in tqdm.tqdm(range(1000), desc='Batch'):
            loss_q1, loss_q2, p_loss, a_loss, a_tlog = trainer.update_parameters(
                replay, args.batch_size, updates)
            scalars = {
                'loss_q1': loss_q1,
                'loss_q2': loss_q2,
                'p_loss': p_loss,
                'a_loss': a_loss,
                'a_tlog': a_tlog,
            }
            bzu.log.scalar(is_train=True, **scalars)
            updates += 1

        bzu.log.end_epoch(agent)
def ros_init(self):
    if self.team == 'A':
        self.agent = SAC(act_dim=2, obs_dim=6, lr_actor=l_rate * (1e-3),
                         lr_value=l_rate * (1e-3), gamma=0.99, tau=0.995)
        rospy.init_node('strategy_node_A', anonymous=True)
        # self.A_info_pub = rospy.Publisher('/nubot1/A_info', Float32MultiArray, queue_size=1)  # 3in1
        self.vel_pub = rospy.Publisher('/nubot1/nubotcontrol/velcmd', VelCmd, queue_size=1)
        self.reset_pub = rospy.Publisher('/gazebo/set_model_state', ModelState, queue_size=10)
        # self.ready2restart_pub = rospy.Publisher('nubot1/ready2restart', Bool, queue_size=1)
        rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
        rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback)
        # rospy.Subscriber('/coach/state', String, self.state_callback)
        # rospy.Subscriber('/coach/reward', Float32, self.reward_callback)
        # rospy.Subscriber('/coach/done', Bool, self.done_callback)
        # rospy.Subscriber('coach/HowEnd', Int16, self.HowEnd_callback)
        # rospy.Subscriber("/rival1/steal", Bool, self.steal_callback)
        rospy.wait_for_service('/nubot1/Shoot')
        self.call_Shoot = rospy.ServiceProxy('/nubot1/Shoot', Shoot)
        # rospy.wait_for_service('/gazebo/reset_simulation')
        # self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty, persistent=True)
        # rospy.wait_for_service('/gazebo/set_model_state')
        # self.call_set_modol = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState)
        rospy.wait_for_service('/nubot1/BallHandle')
        self.call_Handle = rospy.ServiceProxy('/nubot1/BallHandle', BallHandle)
        rospy.wait_for_service('/rival1/BallHandle')
        self.call_B_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle)
    elif self.team == 'B':
        rospy.init_node('strategy_node_B', anonymous=True)
        self.vel_pub = rospy.Publisher('/rival1/nubotcontrol/velcmd', VelCmd, queue_size=1)
        self.steal_pub = rospy.Publisher('/rival1/steal', Bool, queue_size=1)  # steal
        rospy.Subscriber("/rival1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
        rospy.Subscriber("/rival1/omnivision/OmniVisionInfo/GoalInfo", PPoint, self.GoalInfo)
        rospy.wait_for_service('/rival1/BallHandle')
        self.call_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle)
    else:
        rospy.init_node('coach', anonymous=True)
        self.state_pub = rospy.Publisher('/coach/state', String, queue_size=1)
        self.reward_pub = rospy.Publisher('/coach/reward', Float32, queue_size=1)
        self.done_pub = rospy.Publisher('coach/done', Bool, queue_size=1)
        self.HowEnd_pub = rospy.Publisher('coach/HowEnd', Int16, queue_size=1)
        rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
        rospy.Subscriber("/rival1/steal", Bool, self.steal_callback)  # steal
        rospy.Subscriber("/nubot1/A_info", Float32MultiArray, self.A_info_callback)
        # rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback)
        rospy.Subscriber('nubot1/ready2restart', Bool, self.ready2restart_callback)
        rospy.wait_for_service('/gazebo/reset_simulation')
        self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty)
def test_dpf_sac(d_path, s_path, threshold=0.02):
    results = []
    for _ in tqdm(range(10)):
        env = gym.make('ActivePerception-v0')
        env.sid = 9900  # test
        dpf = DPF().to(device)
        dpf.load_model(d_path)
        sac = SAC(24)
        sac.load_model(s_path)
        reward = 0
        for episode in tqdm(range(100)):
            scene_data, obs = env.reset(False)
            s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
            o = trans_rgb(obs['o']).to(device)    # [1, C, H, W] rgb
            d = trans_d(obs['d']).to(device)      # [1, 1, H, W] depth
            p, w, p_n, x, h = dpf(o, d, n_new=K)
            mean, var = get_variance(p, w)
            h_numpy = torch.cat((mean, var), -1).view(-1).detach().cpu().numpy()
            steps = np.random.choice(7, 7, 0) + 1
            for step in steps:
                # th = np.random.rand()*np.pi*2-np.pi
                # th = np.pi/4*step
                th = sac.policy_net.get_action(h_numpy.reshape(1, -1)).item()
                obs = env.step(th)
                o = trans_rgb(obs['o']).to(device)
                d = trans_d(obs['d']).to(device)
                th = torch.FloatTensor([th]).view(1, -1).to(device)
                n_new = int(0.7 * K)  # int(K*(0.5**(_+1)))
                p, w, p_n, x, h = dpf(o, d, th, p, w, h, n_new, True)
                mean, var = get_variance(p, w)
                h_numpy = torch.cat((mean, var), -1).view(-1).detach().cpu().numpy()
                p_ = (F.softmax(w, 1).unsqueeze(2).unsqueeze(3) * p).sum(1)
                mse = F.mse_loss(p_, s).item()
                d = (mse < threshold)
                r = 8 if d else -1
                reward += r
                if d:
                    break
        results.append(reward / 100)
    results = np.array(results)
    print("DPF SAC avg reward %10.4f | std %10.4f" % (np.mean(results), np.std(results)))
def get_policy(buffer, model, measure, mode, d_state, d_action,
               policy_replay_size, policy_batch_size, policy_active_updates,
               policy_n_hidden, policy_lr, policy_gamma, policy_tau,
               policy_explore_alpha, policy_exploit_alpha, buffer_reuse,
               device, verbosity, _log):
    if verbosity:
        _log.info("... getting fresh agent")

    policy_alpha = policy_explore_alpha if mode == 'explore' else policy_exploit_alpha
    agent = SAC(d_state=d_state, d_action=d_action, replay_size=policy_replay_size,
                batch_size=policy_batch_size, n_updates=policy_active_updates,
                n_hidden=policy_n_hidden, gamma=policy_gamma, alpha=policy_alpha,
                lr=policy_lr, tau=policy_tau)
    agent = agent.to(device)
    agent.setup_normalizer(model.normalizer)

    if not buffer_reuse:
        return agent

    if verbosity:
        _log.info("... transferring exploration buffer")

    size = len(buffer)
    for i in range(0, size, 1024):
        j = min(i + 1024, size)
        s, a = buffer.states[i:j], buffer.actions[i:j]
        ns = buffer.states[i:j] + buffer.state_deltas[i:j]
        s, a, ns = s.to(device), a.to(device), ns.to(device)
        with torch.no_grad():
            mu, var = model.forward_all(s, a)
            r = measure(s, a, ns, mu, var, model)
        agent.replay.add(s, a, r, ns)

    if verbosity:
        _log.info("... transferred exploration buffer")

    return agent
def main(args):
    """
    Train and save the SAC model, for the halfcheetah problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make(args.env)
    test_env = gym.make(args.env)
    if args.ent_coef is None:
        args.ent_coef = 'auto'
    model = SAC(env=env, test_env=test_env, seed=int(args.seed),
                ent_coef=args.ent_coef, reward_scale=5.)
    ep_rewards = model.learn(total_timesteps=int(args.max_timesteps),
                             save_path=args.save_path)
    model.save(args.save_path + "/%s_model_seed%d_fin_auto.zip" % (args.env, int(args.seed)))
    np.save(
        args.save_path + "/%s_rews_seed%d_fin_auto.npy" % (args.env, int(args.seed)),
        np.array(ep_rewards))
    # print("Saving model to halfcheetah_model.zip")
    # model.learn(total_timesteps=100)
    # model.load("halfcheetah_model.zip")
    model.evaluate(10)
class RosAgent:
    def __init__(self):
        self.observation_reward = rospy.Subscriber("/rl/environment_response", reward_observation,
                                                   self.policy, queue_size=10)
        self.act_pub = rospy.Publisher("/rl/final_action", action_agent, queue_size=10)
        self.prev_state = None
        self.state = None
        self.reward = None
        self.final_state = None
        self.agent_act = None  # last action returned by the agent, reused on the next callback
        self.agent = SAC()

    def get_state(self, obs):
        self.state = obs.observations[:-1]

    def get_human_act(self, obs):
        self.human_act = obs.observations[-1]

    def policy(self, obs_reward):
        h = std_msgs.msg.Header()
        h.stamp = rospy.Time.now()
        self.prev_state = obs_reward.prev_state
        self.reward = obs_reward.reward
        self.get_state(obs_reward)
        self.get_human_act(obs_reward)
        self.final_state = obs_reward.final_state
        # Update with the action that led to this transition. The original code referenced
        # agent_act before assigning it, so the last action is now stored on the instance
        # and the update is skipped until a first action exists.
        if self.agent_act is not None:
            self.agent.update_rw_state(self.prev_state, self.reward, self.agent_act,
                                       self.state, self.final_state)
        agent_act = self.agent.next_action(self.state)
        self.agent_act = agent_act
        # exec_time = self.game.play([0.0, agent_act.item()])
        # agent_act = self.agent.next_action()
        # print("reward: %d" % (self.reward))
        act = action_agent()
        act.header = h
        act.action = [self.human_act, agent_act]
        self.act_pub.publish(act)
def run(sdk_conn: cozmo.conn):
    """
    Container of the main loop. It is necessary to work with Cozmo.
    This is called by the cozmo.connect present in the main loop of this file.

    :param sdk_conn: SDK connection to Anki Cozmo
    :type sdk_conn: cozmo.conn
    :return: nothing
    :rtype: nothing
    """
    gettrace = getattr(sys, 'gettrace', None)
    if gettrace is not None and gettrace():
        debug = True
    else:
        debug = False

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    robot = sdk_conn.wait_for_robot()
    robot.enable_device_imu(True, True, True)

    # Turn on image receiving by the camera
    robot.camera.image_stream_enabled = True

    # Setting up Hyper-Parameters
    args, folder, logger, restore = initial_setup()
    # if not debug:
    #     tb_tool = TensorBoardTool(folder)
    #     tb_tool.run()
    logger.debug("Initial setup completed.")

    # Create JSON of Hyper-Parameters for reproducibility
    with open(folder + "hp.json", 'w') as outfile:
        json.dump(vars(args), outfile)

    # Initialize Environment
    gym_cozmo.initialize(robot, args.img_h, args.img_w)
    env = gym.make(args.env_name)

    # Setup the agent
    agent = SAC(args.state_buffer_size, env.action_space, env, args, folder, logger)
    agent.train(args.max_num_run, restore)
    env.close()
    logger.important("Program closed correctly!")
def test_rnn_sac(r_path, s_path, threshold=0.02):
    rnn = RNNFilter().to(device)
    rnn.load_model(r_path)
    sac = SAC()
    sac.load_model(s_path)

    results = []
    for _ in tqdm(range(10)):
        env = gym.make('ActivePerception-v0')
        env.sid = 9900  # test
        reward = 0
        for episode in range(100):
            scene_data, obs = env.reset(False)
            s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
            o = trans_rgb(obs['o']).to(device)    # [1, C, H, W] rgb
            d = trans_d(obs['d']).to(device)      # [1, 1, H, W] depth
            s_, h = rnn(o, d)
            h_numpy = h.view(-1).detach().cpu().numpy()
            steps = np.random.choice(7, 7, 0) + 1
            # for step in range(7):  # n_actions allowed
            for step in steps:
                # th = 2*np.pi*np.random.rand()-np.pi
                th = sac.policy_net.get_action(h_numpy.reshape(1, -1)).item()
                # th = np.pi/4*step
                obs = env.step(th)
                o = trans_rgb(obs['o']).to(device)
                d = trans_d(obs['d']).to(device)
                th = torch.FloatTensor([th]).view(1, -1).to(device)
                s_, h = rnn(o, d, th, h)
                h_numpy = h.view(-1).detach().cpu().numpy()
                mse = F.mse_loss(s_, s).item()
                d = (mse < threshold)
                r = 8 if d else -1
                reward += r
                if d:
                    break
        results.append(reward / 100)
    results = np.array(results)
    print("RNN SAC avg reward %10.4f | std %10.4f" % (np.mean(results), np.std(results)))
def test(arglist):
    env_name = arglist.env
    train_seed = arglist.train_seed
    test_seed = arglist.test_seed
    n_episodes = arglist.n_episodes
    render = arglist.render
    max_timesteps = 1001

    # env = gym.make(env_name)
    env = gen_envs(arglist)

    # Set random seed
    env.seed(test_seed)
    torch.manual_seed(test_seed)
    np.random.seed(test_seed)

    # load pretrained RL models
    agent = SAC(env.observation_space.shape[0], env.action_space, arglist)
    agent.load_model(env_name, train_seed)

    total_reward_list = []
    for ep in range(1, n_episodes + 1):
        ep_reward = 0.0
        state = env.reset()
        for t in range(max_timesteps):
            noise = np.random.normal(0.0, 1.0, size=state.shape)
            noise = np.clip(noise, -1.0, 1.0)
            adv_state = state + arglist.noise_scale * noise
            action = agent.select_action(adv_state, eval=True)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if done:
                break
        # print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        total_reward_list.append(ep_reward)
        ep_reward = 0.0

    env.close()
    return total_reward_list
def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    sac = SAC(args, action_dim, max_action, state_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_sac_{}'.format(args['env_name'], args['noise_type']))

    timestep = 0
    start_time = time.time()
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        while True:
            action = sac.get_action(state)
            next_state, reward, done, info = env.step(action)
            sac.save(state, action, reward, next_state, int(done))

            episode_reward += reward
            state = next_state
            timestep += 1

            if sac.memory_counter > args['batch_size']:  # start training once the buffer holds more than batch_size (64) samples
                sac.train()

            if done:
                print('episode: ', episode, ' reward : %.3f' % (episode_reward), ' timestep :', timestep)
                summary.add_scalar('reward/episode', episode_reward, episode)
                break

        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel'):
                os.mkdir('./SaveModel')
            torch.save(sac.actor.state_dict(),
                       './SaveModel/{}_sac_{}_{}'.format(args['env_name'], args['noise_type'], episode))
    env = gym.make(args.env_name, reward_type='dense')
else:
    env = gym.make(args.env_name)
test_env = gym.make(args.env_name)

args.cuda = True if torch.cuda.is_available() else False

# Agent
if args.gcp:
    obs_space = env.observation_space['desired_goal'].shape[0] + \
                env.observation_space['observation'].shape[0]
else:
    obs_space = env.observation_space.shape[0]
args.automatic_entropy_tuning = True
agent = SAC(obs_space, env.action_space, args)

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    if args.gcp:
        goal = state['desired_goal']
memory_discount = 0.95
memory_horizon = 1

sac_params = {
    "linear_output": sac_input,
    "lr": 0.0003,
    "target_entropy": -2,
    "batch_size": 64,
    "hidden_size": 128
}

# Create the controller for the Donkey env
env = Car("kari_main", "mqtt.eclipse.org")
env.reset()

# Create the SAC agent to control the env
agent = SAC(parameters=sac_params)

# Create the state representation functionality
if input("Load model?"):
    agent = torch.load("model.pth")

throttle_weight_1 = 0.1
throttle_weight_2 = -5

STEER_LIMIT_LEFT = -1
STEER_LIMIT_RIGHT = 1
THROTTLE_MAX = 0.23
THROTTLE_MIN = 0.15
MAX_STEERING_DIFF = 0.5

action_space = spaces.Box(low=np.array([STEER_LIMIT_LEFT, THROTTLE_MIN]),
                    action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
env.seed(args.seed)
# env.action_space.seed(args.seed)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0] + 3, env.action_space, args)

# Memory
memory = ReplayGMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0
did_it = False

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    episode = []
    state = env.reset()
    worst_goals = np.array(
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                    help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                    help='model updates per simulator step (default: 1)')
parser.add_argument('--start_steps', type=int, default=1000, metavar='N',
                    help='Steps sampling random actions (default: 10000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 10000000)')
parser.add_argument('--cuda', action="store_true", default=True,
                    help='run on CUDA (default: False)')
args = parser.parse_args()

env = ReachAvoidAgent()

agent = SAC(env.observation_space.shape[0], env.action_space, args)
agent.load_model("models/pretrain", "models/sac_critic_ra_3.0")
# agent.load_model("models/sac_actor_ra_11.0", "models/sac_critic_ra_11.0")

def compute_done(a_state, d_state, times):
    distance = np.sqrt(np.sum(np.square(a_state[:2] - d_state[:2])))
    # print(distance)
    if distance < 0.2 or a_state[0] > 1:
        done_n = True
    else:
        done_n = False
    if times > 400:
        done_n = True
    if a_state[1] > 1 or a_state[1] < -1 or a_state[0] < -1.5:
        done_n = True
    return done_n
def run_session(db_name, max_session_length, sweep, session, model_name, params): alg = SAC(params) car = Car() car.reset() training_after_episodes = params["training_after_episodes"] episode = 0 random_episodes = params["random_episodes"] max_episode_length = params["max_episode_length"] THROTTLE_MAX = params["throttle_max"] THROTTLE_MIN = params["throttle_min"] STEER_LIMIT_LEFT = -1 STEER_LIMIT_RIGHT = 1 action_space = spaces.Box(low=np.array([STEER_LIMIT_LEFT, -1]), high=np.array([STEER_LIMIT_RIGHT, 1]), dtype=np.float32) for i in range(max_session_length): episode += 1 throttle = 0.15 try: step = 0 state = car.reset() time.sleep(1) state = car.step([0, 0.01]) #print(state) state = alg.process_image(state) state = np.stack((state, state, state, state), axis=0) episode_buffer = EpisodeBuffer(alg.horizon, alg.discount) episode_reward = 0 while step < max_episode_length: t = time.time_ns() step += 1 temp = state[np.newaxis, :] if episode < random_episodes: action = action_space.sample() else: action = alg.select_action(temp) #action[1] = max(THROTTLE_MIN, min(THROTTLE_MAX, action[1])) action[0] = max(STEER_LIMIT_LEFT, min(STEER_LIMIT_RIGHT, action[0])) throttle += action[1] / 100.0 throttle = max(THROTTLE_MIN, min(THROTTLE_MAX, throttle)) action[1] = throttle action[1] = 0.3 next_state = car.step(action) im = next_state darkness = len(im[(im > 120) * (im < 130)]) if darkness < 2500: # < len(im[(im > 160) * (im < 170)]): raise KeyboardInterrupt next_state = alg.process_image(next_state) reward = (throttle - THROTTLE_MIN) / (THROTTLE_MAX - THROTTLE_MIN) reward = darkness / 7000 image_to_ascii(next_state[::2].T) episode_reward += reward print( "Sweep: {}, Episode: {}, Step: {}, Episode reward: {:.2f}, Step reward: {:.2f}" .format(sweep, episode, step, episode_reward, reward)) not_done = 1.0 next_state = next_state[np.newaxis, :] next_state = np.vstack((state[:3, :, :], next_state)) out = episode_buffer.add( [state, action, [reward], next_state, [not_done]]) last = [state, action, [reward], next_state, [not_done]] alg.push_buffer(last) #if out: #alg.push_buffer(out) state = next_state if len(alg.replay_buffer) > alg.batch_size: alg.update_parameters() tn = time.time_ns() #sync with the network time.sleep(max(0, 0.1 - (tn - t) / 1e9)) raise KeyboardInterrupt except KeyboardInterrupt: last[4] = [0] alg.push_buffer(last) car.reset() #if episode % 5 == 0: #print("Saving chekcpoint") #torch.save(alg, "sac_model_checkpoint.pth") print("Calculating reward") # episode_buffer = episode_buffer.as_list() # for i in range(len(episode_buffer)): # reward = 0 # for j in range(min(len(episode_buffer) - i, alg.horizon)): # reward += alg.discount**j * episode_buffer[i + j][2][0] # norm = (1 - alg.discount**alg.horizon) / (1 - alg.discount) # e = episode_buffer[i] # e[2] = [reward / norm] # if i == len(episode_buffer) - 1: # e[-1][0] = 0.0 # alg.push_buffer(e) if len(alg.replay_buffer) > alg.batch_size: print("Training") for i in range(training_after_episodes): alg.update_parameters() db.insert_episode(db_name, session, episode, step, episode_reward) time.sleep(5)
class Strategy(object): def __init__(self, team): self.sac_cal = sac_calculate() self.a = [] self.s = [] self.r = 0.0 self.done = False self.avg_arr = np.zeros(64) self.team = team self.RadHead2Ball = 0.0 self.RadHead = 0.0 self.NunbotAPosX = 0.0 self.NunbotAPosY = 0.0 self.NunbotBPosX = 0.0 self.BallPosX = 0.0 self.BallPosY = 0.0 self.GoalX = 900.0 self.GoalY = 0.0 self.StartX = -900.0 self.StartY = 0.0 self.kick_count = 0 self.kick_num = 0 self.score = 0 self.RadTurn = 0.0 self.Steal = False self.dis2start = 0.0 self.dis2goal = 0.0 self.vec = VelCmd() self.A_info = np.array([1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0]) self.game_count = 2 self.A_z = 0.0 self.B_z = 0.0 self.HowEnd = 0 self.B_dis = 0.0 self.ep_rwd = 0 self.is_kick = False self.ready2restart = True self.list_rate = list(np.zeros(128)) self.milestone = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.step_milestone = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.milestone_idx = 0 self.is_in = False self.is_out = False self.is_steal = False self.is_fly = False self.is_stealorfly = False self.real_resart = True self.step_count = 0 self.LeftAng = 0 self.Dis2Ball = 0 def callback(self, data): # Rostopic 之從外部得到的值 self.RadHead2Ball = data.ballinfo.real_pos.angle self.Dis2Ball = data.ballinfo.real_pos.radius self.RadHead = data.robotinfo[0].heading.theta self.BallPosX = data.ballinfo.pos.x self.BallPosY = data.ballinfo.pos.y self.NunbotAPosX = data.robotinfo[0].pos.x self.NunbotAPosY = data.robotinfo[0].pos.y self.NunbotBPosX = data.obstacleinfo.pos[0].x self.B_dis = data.obstacleinfo.polar_pos[0].radius def steal_callback(self, data): self.Steal = data.data def A_info_callback(self, data): self.A_info = np.array(data.data) self.is_kick = True def GoalInfo(self, data): self.LeftAng = data.left_angle def state_callback(self, data): self.kick_count = 0 def reward_callback(self, data): pass # self.r = data.data def done_callback(self, data): self.done = data.data def fly_callback(self, data): self.A_z = data.pose[5].position.z self.B_z = data.pose[6].position.z def HowEnd_callback(self, data): self.HowEnd = data.data def ready2restart_callback(self, data): self.restart() self.ready2restart = False def ros_init(self): if self.team == 'A': self.agent = SAC(act_dim=2, obs_dim=7, lr_actor=l_rate * (1e-3), lr_value=l_rate * (1e-3), gamma=0.99, tau=0.995) rospy.init_node('strategy_node_A', anonymous=True) # self.A_info_pub = rospy.Publisher('/nubot1/A_info', Float32MultiArray, queue_size=1) # 3in1 self.vel_pub = rospy.Publisher('/nubot1/nubotcontrol/velcmd', VelCmd, queue_size=1) self.reset_pub = rospy.Publisher('/gazebo/set_model_state', ModelState, queue_size=10) # self.ready2restart_pub = rospy.Publisher('nubot1/ready2restart',Bool, queue_size=1) rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback) rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback) # rospy.Subscriber('/coach/state', String, self.state_callback) # rospy.Subscriber('/coach/reward', Float32, self.reward_callback) # rospy.Subscriber('/coach/done', Bool, self.done_callback) # rospy.Subscriber('coach/HowEnd', Int16, self.HowEnd_callback) # rospy.Subscriber("/rival1/steal", Bool, self.steal_callback) rospy.wait_for_service('/nubot1/Shoot') self.call_Shoot = rospy.ServiceProxy('/nubot1/Shoot', Shoot) # rospy.wait_for_service('/gazebo/reset_simulation') # self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty, persistent=True) # rospy.wait_for_service('/gazebo/set_model_state') # self.call_set_modol = 
rospy.ServiceProxy('/gazebo/set_model_state', SetModelState) rospy.wait_for_service('/nubot1/BallHandle') self.call_Handle = rospy.ServiceProxy('/nubot1/BallHandle', BallHandle) rospy.wait_for_service('/rival1/BallHandle') self.call_B_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle) elif self.team == 'B': rospy.init_node('strategy_node_B', anonymous=True) self.vel_pub = rospy.Publisher('/rival1/nubotcontrol/velcmd', VelCmd, queue_size=1) self.steal_pub = rospy.Publisher('/rival1/steal', Bool, queue_size=1) # steal rospy.Subscriber("/rival1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback) rospy.Subscriber("/rival1/omnivision/OmniVisionInfo/GoalInfo", PPoint, self.GoalInfo) rospy.wait_for_service('/rival1/BallHandle') self.call_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle) else: rospy.init_node('coach', anonymous=True) self.state_pub = rospy.Publisher('/coach/state', String, queue_size=1) self.reward_pub = rospy.Publisher('/coach/reward', Float32, queue_size=1) self.done_pub = rospy.Publisher('coach/done', Bool, queue_size=1) self.HowEnd_pub = rospy.Publisher('coach/HowEnd', Int16, queue_size=1) rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback) rospy.Subscriber("/rival1/steal", Bool, self.steal_callback) # steal rospy.Subscriber("/nubot1/A_info", Float32MultiArray, self.A_info_callback) # rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback) rospy.Subscriber('nubot1/ready2restart', Bool, self.ready2restart_callback) rospy.wait_for_service('/gazebo/reset_simulation') self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty) def ball_out(self): if self.BallPosX >= 875 or self.BallPosX <= -875 or self.BallPosY >= 590 or self.BallPosY <= -590: self.show('Out') self.is_out = True def ball_in(self): if self.BallPosX >= 870 and self.BallPosX <= 900 and self.BallPosY >= -100 and self.BallPosY <= 100: self.show('in') self.is_in = True def fly(self): if self.A_z > 0.30 or self.B_z > 0.30: self.is_fly = True def steal(self): rospy.wait_for_service('/nubot1/BallHandle') rospy.wait_for_service('/rival1/BallHandle') if self.call_B_Handle( 1).BallIsHolding and not self.call_Handle(1).BallIsHolding: self.is_steal = True def stealorfly(self): if self.is_fly or self.is_steal: self.is_stealorfly = True def show(self, state): global _state if state != _state: print(state) _state = state def cnt_rwd(self): data = Float32MultiArray() data.data = [ self.kick_count, self.cal_dis2start(), self.cal_dis2goal(), self.B_dis, 0, 0, 0, 0 ] if self.game_is_done(): if self.HowEnd == 1: data.data[4] = 1 data.data[5] = 0 data.data[6] = 0 elif self.HowEnd == -1: data.data[4] = 0 data.data[5] = 0 data.data[6] = 1 elif self.HowEnd == -2: data.data[4] = 0 data.data[5] = 1 data.data[6] = 0 # self.A_info_pub.publish(data) # 2C self.sac_cal = sac_calculate() reward = self.sac_cal.reward(data.data) data.data[7] = reward print('rwd init', ['kck_n g_dis st_dis opp_dis in out steal ttl']) print('rwd unit', np.around((data.data), decimals=1)) print('rwd :', reward) return (reward) def kick(self): global MaxSpd_A self.vec.Vx = MaxSpd_A * math.cos(self.RadHead2Ball) self.vec.Vy = MaxSpd_A * math.sin(self.RadHead2Ball) # self.vec.w = self.RadHead2Ball * RotConst self.vec.w = 0 self.vel_pub.publish(self.vec) global pwr # rospy.wait_for_service('/nubot1/Shoot') self.call_Shoot(pwr, 1) # power from GAFuzzy while 1: self.chase(MaxSpd_A) if self.game_is_done() and self.real_resart: break if not self.call_Handle(1).BallIsHolding: 
self.kick_count = self.kick_count + 1 # time.sleep(0.2) print("Kick: %d" % self.kick_count) print('-------') break print('in') def chase(self, MaxSpd): self.vec.Vx = MaxSpd * math.cos(self.RadHead2Ball) self.vec.Vy = MaxSpd * math.sin(self.RadHead2Ball) self.vec.w = self.RadHead2Ball * RotConst self.vel_pub.publish(self.vec) def chase_B(self, MaxSpd): self.vec.Vx = MaxSpd * math.cos(self.RadHead2Ball) self.vec.Vy = MaxSpd * math.sin(self.RadHead2Ball) self.vec.w = self.RadHead2Ball * RotConst / 4 ####### self.vel_pub.publish(self.vec) def turn(self, angle): global MaxSpd_A self.vec.Vx = MaxSpd_A * math.cos(self.RadHead2Ball) self.vec.Vy = MaxSpd_A * math.sin(self.RadHead2Ball) # self.vec.Vx = 0 # self.vec.Vy = 0 self.vec.w = turnHead2Kick(self.RadHead, angle) * RotConst self.vel_pub.publish(self.vec) self.show("Turning") def cal_dis2start(self): # last kick dis2start_x = self.NunbotAPosX - self.StartX dis2start_y = self.NunbotAPosY - self.StartY dis2start = math.hypot(dis2start_x, dis2start_y) return dis2start # self.dis2start_pub.publish(dis2start) def cal_dis2goal(self): # last kick dis2goal_x = self.NunbotAPosX - self.GoalX dis2goal_y = self.NunbotAPosY - self.GoalY dis2goal = math.hypot(dis2goal_x, dis2goal_y) return dis2goal # self.dis2goal_pub.publish(dis2goal) def avg(self, n, l): l = np.delete(l, 0) l = np.append(l, n) self.avg_arr = l print(self.avg_arr) print(sum(l) / 64) def reset_ball(self): while self.BallPosX != -680: Ball_msg = ModelState() Ball_msg.model_name = 'football' Ball_msg.pose.position.x = -6.8 #-6 #-6.8 Ball_msg.pose.position.y = random.uniform(-3.3, 3.3) # Ball_msg.pose.position.y = 0 Ball_msg.pose.position.z = 0.12 Ball_msg.pose.orientation.x = 0 Ball_msg.pose.orientation.y = 0 Ball_msg.pose.orientation.z = 0 Ball_msg.pose.orientation.w = 1 self.reset_pub.publish(Ball_msg) def reset_A(self): while self.NunbotAPosX != -830: A_msg = ModelState() A_msg.model_name = 'nubot1' A_msg.pose.position.x = -8.3 #-8 #-8.5 A_msg.pose.position.y = random.uniform(-1.7, 1.7) #0 # A_msg.pose.position.y = 0 A_msg.pose.position.z = 0 A_msg.pose.orientation.x = 0 A_msg.pose.orientation.y = 0 A_msg.pose.orientation.z = 0 A_msg.pose.orientation.w = 1 self.reset_pub.publish(A_msg) def reset_B(self): while self.NunbotBPosX != 0: B_msg = ModelState() B_msg.model_name = 'rival1' # a = [[2,0],[1, -2],[1, 2],[0,4],[0,-4]] # b = a[random.randint(0, 4)] c = random.uniform(-5, 5) B_msg.pose.position.x = 0 # B_msg.pose.position.y = 0 B_msg.pose.position.y = c B_msg.pose.position.z = 0 B_msg.pose.orientation.x = 0 B_msg.pose.orientation.y = 0 B_msg.pose.orientation.z = 0 B_msg.pose.orientation.w = 1 self.reset_pub.publish(B_msg) def restart(self): # game_state_word = "game is over" # self.state_pub.publish(game_state_word) # 2A # self.Steal = False self.reset_ball() print('Game %d over' % (self.game_count - 1)) print('-----------Restart-----------') print('Game %d start' % self.game_count) self.reset_A() self.game_count += 1 self.kick_count = 0 self.reset_B() # self.call_set_modol(SetModelState) # print('after call_restart') self.ready2restart = False self.is_fly = False self.is_steal = False self.is_stealorfly = False self.is_in = False self.is_out = False # def save(self, agent, in_rate): # print('save file') # if os.path.isdir(agent.path+str(in_rate)): # shutil.rmtree(agent.path+str(in_rate)) # os.mkdir(agent.path+str(in_rate)) # ckpt_path = os.path.join(agent.path+str(in_rate), 'SAC.ckpt') # save_path = agent.saver.save(agent.sess, ckpt_path, write_meta_graph=False) # print("\nSave Model 
%s\n" % save_path) def save(self, agent, in_rate): print('save file') if os.path.isdir(agent.path): shutil.rmtree(agent.path) os.mkdir(agent.path) ckpt_path = os.path.join(agent.path, 'SAC.ckpt') save_path = agent.saver.save(agent.sess, ckpt_path, write_meta_graph=False) print("\nSave Model %s\n" % save_path) def end_rate(self, end): self.list_rate[self.game_count % 128] = end out_count = self.list_rate.count(-2) in_count = self.list_rate.count(1) steal_count = self.list_rate.count(-1) in_rate = in_count / 128 print('in_rate', in_count / 128, 'out_rate', out_count / 128, 'steal_rate', steal_count / 128) if in_count / 128 != 0 and self.milestone_idx == 0: self.milestone[0] = self.game_count self.step_milestone[0] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.1 and self.milestone_idx == 1: self.milestone[1] = self.game_count self.step_milestone[1] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.2 and self.milestone_idx == 2: self.milestone[2] = self.game_count self.step_milestone[2] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.3 and self.milestone_idx == 3: self.milestone[3] = self.game_count self.step_milestone[3] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.4 and self.milestone_idx == 4: self.milestone[4] = self.game_count self.step_milestone[4] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.5 and self.milestone_idx == 5: self.milestone[5] = self.game_count self.step_milestone[5] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.6 and self.milestone_idx == 6: self.milestone[6] = self.game_count self.step_milestone[6] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.7 and self.milestone_idx == 7: self.milestone[7] = self.game_count self.step_milestone[7] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.8 and self.milestone_idx == 8: self.milestone[8] = self.game_count self.step_milestone[8] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 0.9 and self.milestone_idx == 9: self.milestone[9] = self.game_count self.step_milestone[9] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) if in_count / 128 >= 1.0 and self.milestone_idx == 10: self.milestone[10] = self.game_count self.step_milestone[10] = self.step_count self.milestone_idx = self.milestone_idx + 1 self.save(self.agent, in_rate) print('milestone', self.milestone) print('milestone', self.step_milestone) def game_is_done(self): self.ball_in() self.ball_out() self.steal() self.fly() self.stealorfly() if self.is_in or self.is_out or self.is_stealorfly: if self.is_in: self.HowEnd = 1 elif self.is_out: self.HowEnd = -2 elif self.is_stealorfly: self.HowEnd = -1 else: print('err') return True else: return False def workA(self): np.set_printoptions(suppress=True) i = 0 fisrt_time_hold = False while not rospy.is_shutdown(): # while self.step_count < 30000: while 1: # print(self.ball_in(), self.ball_out(), self.stealorfly()) rospy.wait_for_service('/nubot1/BallHandle') self.call_Handle(1) # open holding 
device if self.game_is_done() and self.real_resart: # print('self.game_is_done()',self.game_is_done()) self.r = self.cnt_rwd() # print('h',self.HowEnd) s_ = self.sac_cal.input(self.HowEnd) #out state if i > 1: if len(self.s) == 7 and len(s_) == 7: # print('000000000000000000',np.shape(self.s), np.shape(self.a)) self.agent.replay_buffer.store_transition( self.s, self.a, self.r, s_, self.done) # print('d',self.done) self.ep_rwd = self.r print('ep rwd value=', self.r) self.end_rate(self.HowEnd) self.step_count = self.step_count + 1 i += 1 self.s = s_ self.done = False if i > 512: # 64 self.agent.learn(i, self.r, self.ep_rwd) # self.ready2restart_pub.publish(True) # self.ready2restart_pub.publish(False) self.real_resart = False self.HowEnd = 0 # print('i want to go in self.restart()') self.restart() # self.end_rate(self.HowEnd) # print('---') # elif not self.game_is_done(): else: # print('self.game_is_done()',self.game_is_done()) rospy.wait_for_service('/nubot1/BallHandle') if not self.call_Handle( 1).BallIsHolding: # BallIsHolding = 0 self.chase(MaxSpd_A) fisrt_time_hold = True rospy.wait_for_service('/nubot1/BallHandle') # do real reset before holding self.ready2restart = False self.is_fly = False self.is_steal = False self.is_stealorfly = False self.is_in = False self.is_out = False self.real_resart = True # elif self.call_Handle( 1).BallIsHolding: # BallIsHolding = 1 global RadHead self.chase(MaxSpd_A) if fisrt_time_hold == True: self.real_resart = True # self.chase(MaxSpd_A) self.show('Touch') self.r = self.cnt_rwd() s_ = self.sac_cal.input(0) #state_for_sac if i >= 1: if len(self.s) == 7 and len(s_) == 7: self.agent.replay_buffer.store_transition( self.s, self.a, self.r, s_, self.done) print('step rwd value= ', self.r) self.step_count = self.step_count + 1 self.done = False i += 1 self.s = s_ self.a = self.agent.choose_action( self.s) #action_from_sac rel_turn_ang = self.sac_cal.output( self.a) #action_from_sac global pwr, MAX_PWR pwr = (self.a[1] + 1) * MAX_PWR / 2 + 1.4 #normalize # sac] rel_turn_rad = math.radians(rel_turn_ang) self.RadTurn = rel_turn_rad + self.RadHead fisrt_time_hold = False if i > 512: self.agent.learn(i, self.r, self.ep_rwd) elif fisrt_time_hold == False: self.chase(MaxSpd_A) error = math.fabs( turnHead2Kick(self.RadHead, self.RadTurn)) if error > angle_thres: # 還沒轉到 self.turn(self.RadTurn) else: # 轉到 self.kick() def ClassicRounding(self, goal_ang, ball_dis, ball_rad, MaxSpd): # alpha = math.radians(ball_ang - goal_ang) ball_ang = math.degrees(ball_rad) alpha = math.radians(ball_ang - goal_ang) if ball_dis > 120 * 3: beta = 0.7 else: beta = 1 if abs(alpha) > beta: alpha = beta * np.sign(alpha) else: pass v_yaw = goal_ang br_x = MaxSpd * math.cos(math.radians(ball_ang)) br_y = MaxSpd * math.sin(math.radians(ball_ang)) v_x = br_x * math.cos(alpha) - br_y * math.sin(alpha) v_y = br_x * math.sin(alpha) + br_y * math.cos(alpha) self.vec.Vx = v_x self.vec.Vy = v_y # print(goal_ang) self.vec.w = math.radians(v_yaw) * 0.25 # self.vec.w = self.RadHead2Ball * RotConst/4####### self.vel_pub.publish(self.vec) return v_x, v_y, v_yaw def workB(self): # catch = False while not rospy.is_shutdown(): rospy.wait_for_service('/rival1/BallHandle') # self.call_Handle(1) #start holding device if not self.call_Handle(1).BallIsHolding: # BallIsHolding = 0 self.steal_pub.publish(False) self.chase_B(MaxSpd_B) # chase ball strategy # self.ClassicRounding(self.LeftAng, self.Dis2Ball, self.RadHead2Ball,MaxSpd_B) # catch = False else: # BallIsHolding = 1 # self.chase(MaxSpd_B/4) # if not catch: 
# catch = True # ticks = time.time() # ticks = ticks + 1 # sec # steal time # if time.time() > ticks: self.steal_pub.publish(True) # 2C # self.show('steal') # ticks += 5 def workC(self): print('Game 1 start') # rate = rospy.Rate(10) np.set_printoptions(suppress=True) while not rospy.is_shutdown(): is_stealorfly = self.stealorfly() is_out = self.ball_out() is_in = self.ball_in() # # [] send rwd 2 A # self.sac_cal = sac_calculate() # # self.A_info = list(self.A_info) # # if self.is_kick: # self.A_info[4] = 0 # self.A_info[5] = 0 # self.A_info[6] = 0 # reward = self.sac_cal.reward(self.A_info) # rwd 2 A # self.reward_pub.publish(reward) # print('step rwd unit = ', np.around((self.A_info), decimals=1 )) # 7in1 # 7 rwd unit # print('step rwd value =',reward) # # self.is_kick = False if is_in or is_out or is_stealorfly: # done 2 A # self.ready2restart = False if is_in: HowEnd = 1 # self.A_info[4] = 1 # self.A_info[5] = 0 # self.A_info[6] = 0 if is_stealorfly: HowEnd = -1 # self.A_info[4] = 0 # self.A_info[5] = 0 # self.A_info[6] = 1 if is_out: HowEnd = -2 # self.A_info[4] = 0 # self.A_info[5] = 1 # self.A_info[6] = 0 self.HowEnd_pub.publish(HowEnd) self.done_pub.publish(True) if self.ready2restart: self.restart()
# Environment
from fetch_env import fetch_env

env = UnityEnvironment(fetch_env(args.env, args.system))
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
num_worker = 11
state_dim = 1060
high = np.ones(39)
action_dim = spaces.Box(-high, high, dtype=np.float32)
agent = SAC(state_dim, action_dim, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Training Loop
total_numsteps = 0
agent_reward = np.zeros(num_worker)
buffer_reward = np.zeros(num_worker)
done = False

env_info = env.reset(train_mode=True)[default_brain]
states = env_info.vector_observations
import gym

from sac import SAC
from utils.sac_runner import vector_train
from utils.sac_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("Pendulum-v0", num_envs=4, asynchronous=True)
    actor = SAC(env.single_observation_space, env.single_action_space, p_lr=1e-3, q_lr=1e-3)
    returns = vector_train(actor, env, 40000, -200)

    eval_env = gym.make("Pendulum-v0")
    evaluate(actor, eval_env, 1, True)
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 10000000)')
parser.add_argument('--cuda', type=bool, default=True,
                    help='run on CUDA (default: False)')
args = parser.parse_args()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

env = gym.make(args.env_name)
env.seed(args.seed)
env.action_space.seed(args.seed)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# Tensorboard
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

if not os.path.exists('models/'):
    os.makedirs('models/')
save_dir = 'models/{}_SAC_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name)
os.system('mkdir {}'.format(save_dir))

# Memory
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0
env.get_observation_space_size = new_get_observation_space_size
env.observation_space = ([0] * env.get_observation_space_size(),
                         [0] * env.get_observation_space_size())
env.observation_space = convert_to_gym(env.observation_space)

# Create log dir for callback model saving
os.makedirs("./temp_models/", exist_ok=True)
env = Monitor(env, "./temp_models/", allow_early_resets=True)

##### TRAIN #####
if args.train:
    check_overwrite(args.model)
    model = SAC(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard_log/")
    model.learn(total_timesteps=int(args.step), log_interval=10,
                tb_log_name="log", callback=callback.callback)
    model.save(MODELS_FOLDER_PATH)

#### TEST #####
if not args.train:
    model = SAC.load(MODELS_FOLDER_PATH)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(scale_range(action, -1, 1, 0, 1))
targetline = mLines[-1]
current_episode = targetline.split(',')[1]

# Agent
agent_args = {
    "gamma": gamma,
    "tau": tau,
    "alpha": alpha,
    "policy": policy,
    "target_update_interval": target_update_interval,
    "automatic_entropy_tuning": automatic_entropy_tuning,
    "cuda": cuda,
    "hidden_size": hidden_size,
    "lr": lr
}
# print("env.action_space", env.action_space)
agent = SAC(env.observation_space.shape[0], env.action_space, agent_args)

"""
# Loading model from last training if it exists  # TODO
filename = "PPO_continuous_" + env_name + "_" + "_" + str(rewardFunctionVersion) + "_" + str(nnVersion) + ".pth"
directory = "./"
fileExists = os.path.isfile(directory + filename)
print("fileExists:::::", fileExists)
if fileExists:
    agent.load_model(actor_path, critic_path)
"""

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), env_name, policy,
    "autotune" if automatic_entropy_tuning else ""))

# Memory
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                    help='model updates per simulator step (default: 1)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 10000000)')
args = parser.parse_args()

# Environment
env = NormalizedActions(gym.make(args.env_name))
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
writer = SummaryWriter()

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
rewards = []
total_numsteps = 0
updates = 0

for i_episode in itertools.count():
    state = env.reset()
    episode_reward = 0
                    action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)

torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)

# Agent
print("modular:", args.modular)
agent_1 = SAC(env.observation_space.shape[0], env.action_space, args)
if args.modular:
    agent_2 = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorboardX
writer_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
writer_1 = SummaryWriter(logdir='runs/{}_SAC_1_{}_{}_{}'.format(
    writer_datetime, args.env_name, args.policy,
    "autotune" if args.automatic_entropy_tuning else ""))
if args.modular:
    writer_2 = SummaryWriter(logdir='runs/{}_SAC_2_{}_{}_{}'.format(
        writer_datetime, args.env_name, args.policy,
        "autotune" if args.automatic_entropy_tuning else ""))
np.random.seed(args.seed)
env.seed(args.seed)

# processing log files
if not os.path.exists('./logs'):
    os.makedirs('./logs')
log_dirname = './logs/{}'.format(args.env)
if not os.path.exists(log_dirname):
    os.makedirs(log_dirname)
log_filename = './logs/{}/{}_{}_{}.log'.format(args.env, args.policy, args.env, args.seed)
log_fd = open(log_filename, 'w+')

# Agent
if not args.use_adv:
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
else:
    agent = ADVSAC(env.observation_space.shape[0], env.action_space, args)

# Tensorboard
writer = tf.summary.FileWriter('tensorboard/{}_{}_{}'.format(
    args.policy, args.env, args.seed))

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = int(-args.start_steps)
updates = 0

for i_episode in itertools.count(1):
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
env.seed(args.seed)
env.action_space.seed(args.seed)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# Tensorboard
path = 'runs/{}_SAC_{}_{}_{}_seed_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else "", args.seed)
writer = SummaryWriter(path)

# Memory
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(0):
def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir, alpha, prefill_steps, discount, batch_size, learning_rate, tau, two_qf): alpha = { 'Ant-v2': 0.1, 'HalfCheetah-v2': 0.2, 'Hopper-v2': 0.2, 'Humanoid-v2': 0.05, 'Walker2d-v2': 0.2, }.get(env_name, alpha) algorithm_params = { 'alpha': alpha, 'batch_size': batch_size, 'discount': discount, 'learning_rate': learning_rate, 'reparameterize': True, 'tau': tau, 'epoch_length': ep_len, 'n_epochs': n_iter, 'two_qf': two_qf, } sampler_params = { 'max_episode_length': 1000, 'prefill_steps': prefill_steps, } replay_pool_params = { 'max_size': 1e6, } value_function_params = { 'hidden_layer_sizes': (64, 64), } q_function_params = { 'hidden_layer_sizes': (64, 64), } policy_params = { 'hidden_layer_sizes': (64, 64), } logz.configure_output_dir(logdir) params = { 'exp_name': exp_name, 'env_name': env_name, 'algorithm_params': algorithm_params, 'sampler_params': sampler_params, 'replay_pool_params': replay_pool_params, 'value_function_params': value_function_params, 'q_function_params': q_function_params, 'policy_params': policy_params } logz.save_params(params) env = gym.envs.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) sampler = utils.SimpleSampler(**sampler_params) replay_pool = utils.SimpleReplayPool( observation_shape=env.observation_space.shape, action_shape=env.action_space.shape, **replay_pool_params) q_function = nn.QFunction(name='q_function', **q_function_params) if algorithm_params.get('two_qf', False): q_function2 = nn.QFunction(name='q_function2', **q_function_params) else: q_function2 = None value_function = nn.ValueFunction( name='value_function', **value_function_params) target_value_function = nn.ValueFunction( name='target_value_function', **value_function_params) policy = nn.GaussianPolicy( action_dim=env.action_space.shape[0], reparameterize=algorithm_params['reparameterize'], **policy_params) sampler.initialize(env, policy, replay_pool) algorithm = SAC(**algorithm_params) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config.gpu_options.allow_growth = True # may need if using GPU with tf.Session(config=tf_config): algorithm.build( env=env, policy=policy, q_function=q_function, q_function2=q_function2, value_function=value_function, target_value_function=target_value_function) for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)): logz.log_tabular('Iteration', epoch) for k, v in algorithm.get_statistics().items(): logz.log_tabular(k, v) for k, v in replay_pool.get_statistics().items(): logz.log_tabular(k, v) for k, v in sampler.get_statistics().items(): logz.log_tabular(k, v) logz.dump_tabular()
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct, logdir, debug, gpu): alpha = { 'Ant-v2': 0.1, 'HalfCheetah-v2': 0.2, 'Hopper-v2': 0.2, 'Humanoid-v2': 0.05, 'Walker2d-v2': 0.2, }.get(env_name, 0.2) algorithm_params = { 'alpha': alpha, 'batch_size': 256, 'discount': 0.99, 'learning_rate': 1e-3, 'reparameterize': reparametrize, 'tau': 0.01, 'epoch_length': 1000, 'n_epochs': 500, 'two_qf': two_qf, } sampler_params = { 'max_episode_length': 1000, 'prefill_steps': 1000, } replay_pool_params = { 'max_size': 1e6, } value_function_params = { 'hidden_layer_sizes': (128, 128), } q_function_params = { 'hidden_layer_sizes': (128, 128), } policy_params = { 'hidden_layer_sizes': (128, 128), } logz.configure_output_dir(logdir) params = { 'exp_name': exp_name, 'env_name': env_name, 'algorithm_params': algorithm_params, 'sampler_params': sampler_params, 'replay_pool_params': replay_pool_params, 'value_function_params': value_function_params, 'q_function_params': q_function_params, 'policy_params': policy_params } logz.save_params(params) env = gym.envs.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) sampler = utils.SimpleSampler(**sampler_params) replay_pool = utils.SimpleReplayPool( observation_shape=env.observation_space.shape, action_shape=env.action_space.shape, **replay_pool_params) q_function = nn.QFunction(name='q_function', **q_function_params) if algorithm_params.get('two_qf', False): q_function2 = nn.QFunction(name='q_function2', **q_function_params) else: q_function2 = None value_function = nn.ValueFunction(name='value_function', **value_function_params) target_value_function = nn.ValueFunction(name='target_value_function', **value_function_params) policy = nn.GaussianPolicy( action_dim=env.action_space.shape[0], reparameterize=algorithm_params['reparameterize'], old_funct=old_funct, **policy_params) sampler.initialize(env, policy, replay_pool) algorithm = SAC(**algorithm_params) gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options) with tf.Session(config=tf_config) as sess: if debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess) algorithm.build(env=env, policy=policy, q_function=q_function, q_function2=q_function2, value_function=value_function, target_value_function=target_value_function) for epoch in algorithm.train(sampler, session=sess, n_epochs=algorithm_params.get( 'n_epochs', 1000)): logz.log_tabular('Iteration', epoch) for k, v in algorithm.get_statistics().items(): logz.log_tabular(k, v) for k, v in replay_pool.get_statistics().items(): logz.log_tabular(k, v) for k, v in sampler.get_statistics().items(): logz.log_tabular(k, v) logz.dump_tabular()
epsilon_start = 1.0
epsilon_final = 0.1
epsilon_decay = 1200000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

# Worker Process Queues
output_queue = mp.Queue(maxsize=args.pop)
params_queue = mp.Queue(maxsize=args.pop)
elite_queue = mp.Queue(maxsize=int(2 * args.pop))

# Agent
agent = SAC(STATE_DIM, ACTION_DIM, args)
policy_checkpoint = torch.load(checkpoint_name + '/actor.pth.tar')
agent.policy.load_state_dict(policy_checkpoint['model_state_dict'])
sac_episodes = args.sac_episodes

# Memory
memory = ReplayMemory(args.replay_size)

processes = []
elite_list = []

# Training Loop
total_numsteps = 0
updates = 0
time_list = []
max_rewards = []
min_rewards = []