def main(args): """ Train and save the SAC model, for the halfcheetah problem :param args: (ArgumentParser) the input arguments """ env = gym.make(args.env) test_env = gym.make(args.env) if args.ent_coef is None: args.ent_coef = 'auto' model = SAC(env=env, test_env=test_env, seed=int(args.seed), ent_coef=args.ent_coef, reward_scale=5.) ep_rewards = model.learn(total_timesteps=int(args.max_timesteps), save_path=args.save_path) model.save(args.save_path + "/%s_model_seed%d_fin_auto.zip" % (args.env, int(args.seed))) np.save( args.save_path + "/%s_rews_seed%d_fin_auto.npy" % (args.env, int(args.seed)), np.array(ep_rewards)) # print("Saving model to halfcheetah_model.zip") # model.learn(total_timesteps=100) # model.load("halfcheetah_model.zip") model.evaluate(10)
def __init__(self):
    self.observation_reward = rospy.Subscriber("/rl/environment_response",
                                               reward_observation,
                                               self.policy, queue_size=10)
    self.act_pub = rospy.Publisher("/rl/final_action", action_agent, queue_size=10)
    self.prev_state = None
    self.state = None
    self.reward = None
    self.final_state = None
    self.agent = SAC()
def run(args):
    env = gym.make(args.env)
    device = torch.device(args.device)

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create nets.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    hidden_sizes = (256, 256)
    ac = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    ac_target = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    hard_update(ac, ac_target)

    # env_sampler = EnvSampler(env, max_episode_step=4000, capacity=1e6)
    env_sampler = EnvSampler2(env, gamma=args.gamma1, capacity=1e6)
    alg = SAC(ac, ac_target, gamma=args.gamma2, alpha=0.2,
              q_lr=1e-3, pi_lr=1e-3, target_lr=5e-3, device=device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state)

    def get_mean_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state, deterministic=True)

    start_time = time()
    for _ in range(args.start_steps):
        env_sampler.addSample()
    print("Warmup uses {}s.".format(time() - start_time))

    for step in range(1, args.total_steps + 1):
        env_sampler.addSample(get_action)
        if step % args.update_every == 0:
            for _ in range(args.update_every):
                batch = env_sampler.sample(args.batch_size)
                losses = alg.update(*batch)
        if step % args.test_every == 0:
            test_reward = env_sampler.test(get_mean_action)
            yield (step, test_reward, *losses)

    torch.save(ac.pi.state_dict(), './env_{}_pi_net.pth.tar'.format(args.env))
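The `hard_update(ac, ac_target)` call above is not defined in these snippets; from the call site it is assumed to copy parameters from the online network into the target. A minimal sketch of it, together with the Polyak-style soft update that SAC implementations typically apply at the `target_lr` rate (both names and the argument order are assumptions taken from the call site, not the project's actual code):

import torch

def hard_update(source: torch.nn.Module, target: torch.nn.Module) -> None:
    # Copy every parameter verbatim so the target starts identical to the source.
    for src, tgt in zip(source.parameters(), target.parameters()):
        tgt.data.copy_(src.data)

def soft_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # Exponential moving average: target <- tau * source + (1 - tau) * target.
    with torch.no_grad():
        for src, tgt in zip(source.parameters(), target.parameters()):
            tgt.data.mul_(1.0 - tau).add_(tau * src.data)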
def ros_init(self):
    if self.team == 'A':
        self.agent = SAC(act_dim=2, obs_dim=6,
                         lr_actor=l_rate * (1e-3), lr_value=l_rate * (1e-3),
                         gamma=0.99, tau=0.995)
        rospy.init_node('strategy_node_A', anonymous=True)
        # self.A_info_pub = rospy.Publisher('/nubot1/A_info', Float32MultiArray, queue_size=1)  # 3 in 1
        self.vel_pub = rospy.Publisher('/nubot1/nubotcontrol/velcmd', VelCmd, queue_size=1)
        self.reset_pub = rospy.Publisher('/gazebo/set_model_state', ModelState, queue_size=10)
        # self.ready2restart_pub = rospy.Publisher('nubot1/ready2restart', Bool, queue_size=1)
        rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
        rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback)
        # rospy.Subscriber('/coach/state', String, self.state_callback)
        # rospy.Subscriber('/coach/reward', Float32, self.reward_callback)
        # rospy.Subscriber('/coach/done', Bool, self.done_callback)
        # rospy.Subscriber('coach/HowEnd', Int16, self.HowEnd_callback)
        # rospy.Subscriber("/rival1/steal", Bool, self.steal_callback)
        rospy.wait_for_service('/nubot1/Shoot')
        self.call_Shoot = rospy.ServiceProxy('/nubot1/Shoot', Shoot)
        # rospy.wait_for_service('/gazebo/reset_simulation')
        # self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty, persistent=True)
        # rospy.wait_for_service('/gazebo/set_model_state')
        # self.call_set_model = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState)
        rospy.wait_for_service('/nubot1/BallHandle')
        self.call_Handle = rospy.ServiceProxy('/nubot1/BallHandle', BallHandle)
        rospy.wait_for_service('/rival1/BallHandle')
        self.call_B_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle)
    elif self.team == 'B':
        rospy.init_node('strategy_node_B', anonymous=True)
        self.vel_pub = rospy.Publisher('/rival1/nubotcontrol/velcmd', VelCmd, queue_size=1)
        self.steal_pub = rospy.Publisher('/rival1/steal', Bool, queue_size=1)  # steal
        rospy.Subscriber("/rival1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
        rospy.Subscriber("/rival1/omnivision/OmniVisionInfo/GoalInfo", PPoint, self.GoalInfo)
        rospy.wait_for_service('/rival1/BallHandle')
        self.call_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle)
    else:
        rospy.init_node('coach', anonymous=True)
        self.state_pub = rospy.Publisher('/coach/state', String, queue_size=1)
        self.reward_pub = rospy.Publisher('/coach/reward', Float32, queue_size=1)
        self.done_pub = rospy.Publisher('coach/done', Bool, queue_size=1)
        self.HowEnd_pub = rospy.Publisher('coach/HowEnd', Int16, queue_size=1)
        rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
        rospy.Subscriber("/rival1/steal", Bool, self.steal_callback)  # steal
        rospy.Subscriber("/nubot1/A_info", Float32MultiArray, self.A_info_callback)
        # rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback)
        rospy.Subscriber('nubot1/ready2restart', Bool, self.ready2restart_callback)
        rospy.wait_for_service('/gazebo/reset_simulation')
        self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty)
def main(args):
    env = gym.make('Carla-v0', n_heroes=N_HEROES, port=PORT)
    replay = MultiReplayBuffer(CAPACITY)

    from sac import SAC
    import torch
    import bz_utils as bzu

    bzu.log.init('log_v1')

    updates = 0
    trainer = SAC(OBSERVATION_SHAPE, N_ACTIONS, args)
    agent = trainer.policy
    # agent.load_state_dict(torch.load('log/latest.t7'))

    for _ in tqdm.tqdm(range(1000)):
        totals = [0 for _ in range(N_HEROES)]
        finished = list()
        states = env.reset(n_vehicles=N_VEHICLES, n_pedestrians=N_PEDESTRIANS)

        for i in tqdm.tqdm(range(1000), desc='Experiences'):
            _, _, actions = agent.sample(preprocess(states))
            actions = actions.detach().cpu().numpy()
            new_states, rewards, dones, infos = env.step(actions)

            for j in range(N_HEROES):
                totals[j] += rewards[j]
                if dones[j]:
                    finished.append(totals[j])
                    totals[j] = 0

            # env.render()
            replay.add(states, actions, rewards, new_states, dones)
            states = new_states

        for j in range(N_HEROES):
            totals[j] += rewards[j]
            finished.append(totals[j])

        bzu.log.scalar(is_train=True, **{'cumulative': np.mean(finished)})

        for i in tqdm.tqdm(range(1000), desc='Batch'):
            loss_q1, loss_q2, p_loss, a_loss, a_tlog = trainer.update_parameters(
                replay, args.batch_size, updates)
            scalars = {
                'loss_q1': loss_q1,
                'loss_q2': loss_q2,
                'p_loss': p_loss,
                'a_loss': a_loss,
                'a_tlog': a_tlog,
            }
            bzu.log.scalar(is_train=True, **scalars)
            updates += 1

        bzu.log.end_epoch(agent)
def test_dpf_sac(d_path, s_path, threshold=0.02):
    results = []
    for _ in tqdm(range(10)):
        env = gym.make('ActivePerception-v0')
        env.sid = 9900  # test
        dpf = DPF().to(device)
        dpf.load_model(d_path)
        sac = SAC(24)
        sac.load_model(s_path)

        reward = 0
        for episode in tqdm(range(100)):
            scene_data, obs = env.reset(False)
            s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
            o = trans_rgb(obs['o']).to(device)    # [1, C, H, W] rgb
            d = trans_d(obs['d']).to(device)      # [1, 1, H, W] depth
            p, w, p_n, x, h = dpf(o, d, n_new=K)
            mean, var = get_variance(p, w)
            h_numpy = torch.cat((mean, var), -1).view(-1).detach().cpu().numpy()

            steps = np.random.choice(7, 7, replace=False) + 1
            for step in steps:
                # th = np.random.rand()*np.pi*2 - np.pi
                # th = np.pi/4*step
                th = sac.policy_net.get_action(h_numpy.reshape(1, -1)).item()
                obs = env.step(th)
                o = trans_rgb(obs['o']).to(device)
                d = trans_d(obs['d']).to(device)
                th = torch.FloatTensor([th]).view(1, -1).to(device)
                n_new = int(0.7 * K)  # int(K*(0.5**(_+1)))
                p, w, p_n, x, h = dpf(o, d, th, p, w, h, n_new, True)
                mean, var = get_variance(p, w)
                h_numpy = torch.cat((mean, var), -1).view(-1).detach().cpu().numpy()
                p_ = (F.softmax(w, 1).unsqueeze(2).unsqueeze(3) * p).sum(1)
                mse = F.mse_loss(p_, s).item()
                d = (mse < threshold)
                r = 8 if d else -1
                reward += r
                if d:
                    break
        results.append(reward / 100)
    results = np.array(results)
    print("DPF SAC avg reward %10.4f | std %10.4f" % (np.mean(results), np.std(results)))
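`get_variance` is not defined in these snippets; from its call sites it appears to take particles `p` and unnormalized log-weights `w` over the particle dimension and return the weighted mean and variance that form the SAC observation. A minimal sketch under the assumption that particles are flattened to shape [B, K, D] with weights [B, K] (the real code may carry image-shaped particles, as the `p_` reconstruction above suggests):

import torch
import torch.nn.functional as F

def get_variance(p, w):
    # p: [B, K, D] particles; w: [B, K] unnormalized log-weights (assumed shapes).
    probs = F.softmax(w, dim=1).unsqueeze(-1)                  # [B, K, 1] normalized weights
    mean = (probs * p).sum(dim=1, keepdim=True)                # [B, 1, D] weighted mean
    var = (probs * (p - mean) ** 2).sum(dim=1, keepdim=True)   # [B, 1, D] weighted variance
    return mean, var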
def get_policy(buffer, model, measure, mode, d_state, d_action, policy_replay_size,
               policy_batch_size, policy_active_updates, policy_n_hidden, policy_lr,
               policy_gamma, policy_tau, policy_explore_alpha, policy_exploit_alpha,
               buffer_reuse, device, verbosity, _log):
    if verbosity:
        _log.info("... getting fresh agent")

    policy_alpha = policy_explore_alpha if mode == 'explore' else policy_exploit_alpha

    agent = SAC(d_state=d_state, d_action=d_action, replay_size=policy_replay_size,
                batch_size=policy_batch_size, n_updates=policy_active_updates,
                n_hidden=policy_n_hidden, gamma=policy_gamma, alpha=policy_alpha,
                lr=policy_lr, tau=policy_tau)
    agent = agent.to(device)
    agent.setup_normalizer(model.normalizer)

    if not buffer_reuse:
        return agent

    if verbosity:
        _log.info("... transferring exploration buffer")

    size = len(buffer)
    for i in range(0, size, 1024):
        j = min(i + 1024, size)
        s, a = buffer.states[i:j], buffer.actions[i:j]
        ns = buffer.states[i:j] + buffer.state_deltas[i:j]
        s, a, ns = s.to(device), a.to(device), ns.to(device)
        with torch.no_grad():
            mu, var = model.forward_all(s, a)
        r = measure(s, a, ns, mu, var, model)
        agent.replay.add(s, a, r, ns)

    if verbosity:
        _log.info("... transferred exploration buffer")

    return agent
def run(sdk_conn: cozmo.conn):
    """
    Container of the main loop. It is necessary to work with Cozmo.
    This is called by the cozmo.connect present in the main loop of this file.

    :param sdk_conn: SDK connection to Anki Cozmo
    :type sdk_conn: cozmo.conn
    :return: nothing
    :rtype: nothing
    """
    gettrace = getattr(sys, 'gettrace', None)
    if gettrace is not None and gettrace():
        debug = True
    else:
        debug = False

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    robot = sdk_conn.wait_for_robot()
    robot.enable_device_imu(True, True, True)

    # Turn on image receiving by the camera
    robot.camera.image_stream_enabled = True

    # Setting up Hyper-Parameters
    args, folder, logger, restore = initial_setup()
    # if not debug:
    #     tb_tool = TensorBoardTool(folder)
    #     tb_tool.run()
    logger.debug("Initial setup completed.")

    # Create JSON of Hyper-Parameters for reproducibility
    with open(folder + "hp.json", 'w') as outfile:
        json.dump(vars(args), outfile)

    # Initialize Environment
    gym_cozmo.initialize(robot, args.img_h, args.img_w)
    env = gym.make(args.env_name)

    # Setup the agent
    agent = SAC(args.state_buffer_size, env.action_space, env, args, folder, logger)
    i_run = args.run
    i_epi = args.episode
    agent.load_model_to_play(args.env_name, folder, i_run, i_epi)
    agent.play()
    env.close()
    logger.important("Program closed correctly!")
def test_rnn_sac(r_path, s_path, threshold=0.02):
    rnn = RNNFilter().to(device)
    rnn.load_model(r_path)
    sac = SAC()
    sac.load_model(s_path)

    results = []
    for _ in tqdm(range(10)):
        env = gym.make('ActivePerception-v0')
        env.sid = 9900  # test
        reward = 0
        for episode in range(100):
            scene_data, obs = env.reset(False)
            s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
            o = trans_rgb(obs['o']).to(device)    # [1, C, H, W] rgb
            d = trans_d(obs['d']).to(device)      # [1, 1, H, W] depth
            s_, h = rnn(o, d)
            h_numpy = h.view(-1).detach().cpu().numpy()

            steps = np.random.choice(7, 7, replace=False) + 1
            # for step in range(7):  # n_actions allowed
            for step in steps:
                # th = 2*np.pi*np.random.rand() - np.pi
                th = sac.policy_net.get_action(h_numpy.reshape(1, -1)).item()
                # th = np.pi/4*step
                obs = env.step(th)
                o = trans_rgb(obs['o']).to(device)
                d = trans_d(obs['d']).to(device)
                th = torch.FloatTensor([th]).view(1, -1).to(device)
                s_, h = rnn(o, d, th, h)
                h_numpy = h.view(-1).detach().cpu().numpy()
                mse = F.mse_loss(s_, s).item()
                d = (mse < threshold)
                r = 8 if d else -1
                reward += r
                if d:
                    break
        results.append(reward / 100)
    results = np.array(results)
    print("RNN SAC avg reward %10.4f | std %10.4f" % (np.mean(results), np.std(results)))
def test(arglist):
    env_name = arglist.env
    train_seed = arglist.train_seed
    test_seed = arglist.test_seed
    n_episodes = arglist.n_episodes
    render = arglist.render
    max_timesteps = 1001

    # env = gym.make(env_name)
    env = gen_envs(arglist)

    # Set random seed
    env.seed(test_seed)
    torch.manual_seed(test_seed)
    np.random.seed(test_seed)

    # Load pretrained RL models
    agent = SAC(env.observation_space.shape[0], env.action_space, arglist)
    agent.load_model(env_name, train_seed)

    total_reward_list = []
    for ep in range(1, n_episodes + 1):
        ep_reward = 0.0
        state = env.reset()
        for t in range(max_timesteps):
            noise = np.random.normal(0.0, 1.0, size=state.shape)
            noise = np.clip(noise, -1.0, 1.0)
            adv_state = state + arglist.noise_scale * noise
            action = agent.select_action(adv_state, eval=True)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if done:
                break
        # print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        total_reward_list.append(ep_reward)
        ep_reward = 0.0

    env.close()
    return total_reward_list
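The function above evaluates a pretrained agent under observations perturbed by clipped Gaussian noise. A typical use is to sweep the perturbation magnitude and compare mean returns; a small sketch, assuming `arglist` carries the same fields used above and that `noise_scale` can be overwritten in place (the helper name is illustrative):

import numpy as np

def noise_sweep(arglist, scales=(0.0, 0.05, 0.1, 0.2)):
    # Evaluate the same pretrained agent under increasing observation noise.
    for scale in scales:
        arglist.noise_scale = scale
        rewards = test(arglist)
        print("noise_scale=%.2f  mean=%.1f  std=%.1f"
              % (scale, np.mean(rewards), np.std(rewards)))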
def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    sac = SAC(args, action_dim, max_action, state_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_sac_{}'.format(args['env_name'], args['noise_type']))

    timestep = 0
    start_time = time.time()
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        while True:
            action = sac.get_action(state)
            next_state, reward, done, info = env.step(action)
            sac.save(state, action, reward, next_state, int(done))
            episode_reward += reward
            state = next_state
            timestep += 1

            if sac.memory_counter > args['batch_size']:
                # start training once the buffer holds more than batch_size (64) samples
                sac.train()

            if done:
                print('episode: ', episode, ' reward : %.3f' % episode_reward, ' timestep :', timestep)
                summary.add_scalar('reward/episode', episode_reward, episode)
                break

        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel'):
                os.mkdir('./SaveModel')
            torch.save(sac.actor.state_dict(),
                       './SaveModel/{}_sac_{}_{}'.format(args['env_name'], args['noise_type'], episode))
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {"state_dim": state_dim,
              "action_dim": action_dim,
              "max_action": max_action,
              "discount": variant['discount'],
              "tau": variant['tau'],
              'network_class': NETWORK_CLASSES[variant['network_class']]}

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                  hidden_dim=variant['hidden_dim'],
                                  fourier_dim=variant['fourier_dim'],
                                  sigma=variant['sigma'],
                                  concatenate_fourier=variant['concatenate_fourier'],
                                  train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])

    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise'] * max_action
        kwargs["noise_clip"] = variant['noise_clip'] * max_action
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']  # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [float(env.action_space.low.min()),
                                  float(env.action_space.high.max())]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        raise RuntimeError

    # load replay buffer
    replay_buffer = torch.load(os.path.join(variant['replay_buffer_folder'],
                                            'generated_replay_buffer.pt'))

    policy_optimizer = torch.optim.Adam(policy.actor.parameters(), lr=variant['lr'])
    qf_optimizer = torch.optim.Adam(policy.critic.Q1.parameters(), lr=variant['lr'])

    # split into train and val for both action and q_value
    indices = np.arange(replay_buffer.max_size)
    random.shuffle(indices)
    train_indices = indices[:int(0.9 * len(indices))]
    val_indices = indices[int(0.9 * len(indices)):]
    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(replay_buffer.state[train_indices]).float(),
        torch.tensor(replay_buffer.action[train_indices]).float(),
        torch.tensor(replay_buffer.correct_action[train_indices]).float(),
        torch.tensor(replay_buffer.q_value[train_indices]).float())
    val_dataset = torch.utils.data.TensorDataset(
        torch.tensor(replay_buffer.state[val_indices]).float(),
        torch.tensor(replay_buffer.action[val_indices]).float(),
        torch.tensor(replay_buffer.correct_action[val_indices]).float(),
        torch.tensor(replay_buffer.q_value[val_indices]).float())

    # train a network on it
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=variant['batch_size'],
                                               shuffle=True, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=variant['batch_size'],
                                             shuffle=True, pin_memory=True)

    train_q_losses = []
    train_policy_losses = []
    val_q_losses = []
    val_policy_losses = []
    for _ in trange(variant['n_train_epochs']):
        total_q_loss = 0
        total_policy_loss = 0
        for (state, action, correct_action, q) in train_loader:
            state = state.to(DEVICE)
            action = action.to(DEVICE)
            correct_action = correct_action.to(DEVICE)
            q = q.to(DEVICE)
            q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
            policy_preds = policy.actor(state).mean
            q_loss = F.mse_loss(q_preds, q)
            policy_loss = F.mse_loss(policy_preds, correct_action)
            qf_optimizer.zero_grad()
            policy_optimizer.zero_grad()
            q_loss.backward()
            policy_loss.backward()
            qf_optimizer.step()
            policy_optimizer.step()
            total_q_loss += q_loss.item()
            total_policy_loss += policy_loss.item()

        # get validation stats
        total_val_q_loss = 0
        total_val_policy_loss = 0
        with torch.no_grad():
            for (state, action, correct_action, q) in val_loader:
                state = state.to(DEVICE)
                action = action.to(DEVICE)
                correct_action = correct_action.to(DEVICE)
                q = q.to(DEVICE)
                q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
                policy_preds = policy.actor(state).mean
                q_loss = F.mse_loss(q_preds, q)
                policy_loss = F.mse_loss(policy_preds, correct_action)
                total_val_q_loss += q_loss.item()
                total_val_policy_loss += policy_loss.item()

        train_q_losses.append(total_q_loss / len(train_loader))
        train_policy_losses.append(total_policy_loss / len(train_loader))
        val_q_losses.append(total_val_q_loss / len(val_loader))
        val_policy_losses.append(total_val_policy_loss / len(val_loader))
        print(f'train: qf loss: {train_q_losses[-1]:.4f}, policy loss: {train_policy_losses[-1]:.4f}')
        print(f'val: qf loss: {val_q_losses[-1]:.4f}, policy loss: {val_policy_losses[-1]:.4f}')

    # evaluate the resulting policy for 100 episodes
    eval_return = eval_policy(policy, variant['env'], variant['seed'],
                              eval_episodes=variant['eval_episodes'])

    # save the results
    to_save = dict(
        train_q_losses=train_q_losses,
        train_policy_losses=train_policy_losses,
        val_q_losses=val_q_losses,
        val_policy_losses=val_policy_losses,
        eval_return=eval_return,
        qf=policy.critic.Q1.state_dict(),
        policy=policy.actor.state_dict()
    )
    torch.save(to_save, os.path.join(variant['replay_buffer_folder'],
                                     f'{variant["network_class"]}_distillation.pt'))
# Environment
from fetch_env import fetch_env
env = UnityEnvironment(fetch_env(args.env, args.system))
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
num_worker = 11
state_dim = 1060
high = np.ones(39)
action_dim = spaces.Box(-high, high, dtype=np.float32)
agent = SAC(state_dim, action_dim, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Training Loop
total_numsteps = 0
agent_reward = np.zeros(num_worker)
buffer_reward = np.zeros(num_worker)
done = False
env_info = env.reset(train_mode=True)[default_brain]
states = env_info.vector_observations
def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir, alpha,
              prefill_steps, discount, batch_size, learning_rate, tau, two_qf):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, alpha)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': batch_size,
        'discount': discount,
        'learning_rate': learning_rate,
        'reparameterize': True,
        'tau': tau,
        'epoch_length': ep_len,
        'n_epochs': n_iter,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': prefill_steps,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (64, 64),
    }
    q_function_params = {
        'hidden_layer_sizes': (64, 64),
    }
    policy_params = {
        'hidden_layer_sizes': (64, 64),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU

    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
epsilon_start = 1.0
epsilon_final = 0.1
epsilon_decay = 1200000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

# Worker Process Queues
output_queue = mp.Queue(maxsize=args.pop)
params_queue = mp.Queue(maxsize=args.pop)
elite_queue = mp.Queue(maxsize=int(2 * args.pop))

# Agent
agent = SAC(STATE_DIM, ACTION_DIM, args)
policy_checkpoint = torch.load(checkpoint_name + '/actor.pth.tar')
agent.policy.load_state_dict(policy_checkpoint['model_state_dict'])
sac_episodes = args.sac_episodes

# Memory
memory = ReplayMemory(args.replay_size)

processes = []
elite_list = []

# Training Loop
total_numsteps = 0
updates = 0
time_list = []
max_rewards = []
min_rewards = []
def run_session(db_name, max_session_length, sweep, session, model_name, params):
    alg = SAC(params)
    car = Car()
    car.reset()

    training_after_episodes = params["training_after_episodes"]
    episode = 0
    random_episodes = params["random_episodes"]
    max_episode_length = params["max_episode_length"]
    THROTTLE_MAX = params["throttle_max"]
    THROTTLE_MIN = params["throttle_min"]
    STEER_LIMIT_LEFT = -1
    STEER_LIMIT_RIGHT = 1

    action_space = spaces.Box(low=np.array([STEER_LIMIT_LEFT, -1]),
                              high=np.array([STEER_LIMIT_RIGHT, 1]),
                              dtype=np.float32)

    for i in range(max_session_length):
        episode += 1
        throttle = 0.15
        try:
            step = 0
            state = car.reset()
            time.sleep(1)
            state = car.step([0, 0.01])
            # print(state)
            state = alg.process_image(state)
            state = np.stack((state, state, state, state), axis=0)
            episode_buffer = EpisodeBuffer(alg.horizon, alg.discount)
            episode_reward = 0
            while step < max_episode_length:
                t = time.time_ns()
                step += 1
                temp = state[np.newaxis, :]
                if episode < random_episodes:
                    action = action_space.sample()
                else:
                    action = alg.select_action(temp)
                # action[1] = max(THROTTLE_MIN, min(THROTTLE_MAX, action[1]))
                action[0] = max(STEER_LIMIT_LEFT, min(STEER_LIMIT_RIGHT, action[0]))
                throttle += action[1] / 100.0
                throttle = max(THROTTLE_MIN, min(THROTTLE_MAX, throttle))
                action[1] = throttle
                action[1] = 0.3

                next_state = car.step(action)
                im = next_state
                darkness = len(im[(im > 120) * (im < 130)])
                if darkness < 2500:  # < len(im[(im > 160) * (im < 170)]):
                    raise KeyboardInterrupt
                next_state = alg.process_image(next_state)

                reward = (throttle - THROTTLE_MIN) / (THROTTLE_MAX - THROTTLE_MIN)
                reward = darkness / 7000
                image_to_ascii(next_state[::2].T)
                episode_reward += reward
                print("Sweep: {}, Episode: {}, Step: {}, Episode reward: {:.2f}, Step reward: {:.2f}"
                      .format(sweep, episode, step, episode_reward, reward))
                not_done = 1.0
                next_state = next_state[np.newaxis, :]
                next_state = np.vstack((state[:3, :, :], next_state))
                out = episode_buffer.add([state, action, [reward], next_state, [not_done]])
                last = [state, action, [reward], next_state, [not_done]]
                alg.push_buffer(last)
                # if out:
                #     alg.push_buffer(out)
                state = next_state
                if len(alg.replay_buffer) > alg.batch_size:
                    alg.update_parameters()
                tn = time.time_ns()
                # sync with the network
                time.sleep(max(0, 0.1 - (tn - t) / 1e9))
            raise KeyboardInterrupt
        except KeyboardInterrupt:
            last[4] = [0]
            alg.push_buffer(last)
            car.reset()
            # if episode % 5 == 0:
            #     print("Saving checkpoint")
            #     torch.save(alg, "sac_model_checkpoint.pth")
            print("Calculating reward")
            # episode_buffer = episode_buffer.as_list()
            # for i in range(len(episode_buffer)):
            #     reward = 0
            #     for j in range(min(len(episode_buffer) - i, alg.horizon)):
            #         reward += alg.discount**j * episode_buffer[i + j][2][0]
            #     norm = (1 - alg.discount**alg.horizon) / (1 - alg.discount)
            #     e = episode_buffer[i]
            #     e[2] = [reward / norm]
            #     if i == len(episode_buffer) - 1:
            #         e[-1][0] = 0.0
            #     alg.push_buffer(e)
            if len(alg.replay_buffer) > alg.batch_size:
                print("Training")
                for i in range(training_after_episodes):
                    alg.update_parameters()
            db.insert_episode(db_name, session, episode, step, episode_reward)
            time.sleep(5)
def readCSMNC(fileName):
    # open files
    fn = open(fileName, 'r')

    # magic number, ns, number of split strings,
    # counted from the *.dat file downloaded from CSMNC
    ns = 59
    tmp1 = fn.read()
    tmp2 = tmp1.split(maxsplit=59)
    tmpData = tmp2[59]

    # output the longitude and latitude of the station to stationList.dat
    tmpLat = tmp2[18]
    tmpLong = tmp2[19]
    lat = tmpLat[0:6]
    lon = tmpLong[0:7]

    # magic number, td, duration time in seconds
    # pay attention to the type of variables!!!
    # td and step are floats
    # nps is an integer
    td = float(tmp2[52])
    # time interval, = 0.005 sec
    step = float(tmp2[43])
    # nps, number of data points
    nps = int(float(tmp2[38]))

    # start to fill in SAC header variables
    # component direction: tmp2[30]
    # time interval
    delta = float(tmp2[43])
    # number of points
    npts = nps
    # station name
    stnm = tmp2[17]
    # instrument
    inst = tmp2[25]
    # component name
    cmpNm = tmp2[30]
    # cmpaz: azimuth; cmpinc: inclination (dip angle)
    if cmpNm == 'UD':
        cmpaz = 0.0
        cmpinc = 0.0
    elif cmpNm == 'EW':
        cmpaz = 90.0
        cmpinc = 90.0
    elif cmpNm == 'NS':
        cmpaz = 0.0
        cmpinc = 90.0
    else:
        raise ValueError('wrong component direction', cmpNm)
    # event name
    evnm = tmp2[5]
    # origin time; prepend '2' so the string starts with a full 4-digit year
    ot = str(2) + tmp2[1]
    nzyear = int(ot[0:4])
    mon = int(ot[4:6])
    day = int(ot[6:8])
    nzhour = int(ot[8:10])
    nzmin = int(ot[10:12])
    nzsec = int(ot[12:])
    nzjday = ymd2jday(nzyear, mon, day)
    nzmsec = 0
    # event latitude
    evla = float(tmp2[9][0:6])
    # event longitude
    evlo = float(tmp2[10][0:7])
    # event depth
    evdp = float(tmp2[12])
    # magnitude
    mag = float(tmp2[15][0:2])
    # station latitude and longitude
    stla = lat
    stlo = lon
    iftype = 1

    # linspace by default includes both start and stop:
    # np.linspace(start, stop, number of samples)
    times = np.linspace(step, td, npts)
    tmp3 = tmpData.split(maxsplit=npts)
    tmp4 = np.asfarray(tmp3)

    # create null SAC object
    head = SAC()
    # set header variables
    head.set_ot(nzyear, nzjday, nzhour, nzmin, nzsec, nzmsec)
    head.set_magtyp(53)
    head.set_dep(8)
    head.set_ovrok(1)
    head.set_cmp(cmpaz, cmpinc)
    head.set_evdp(evdp)
    head.set_evla(evla)
    head.set_evlo(evlo)
    head.set_kevnm(evnm)
    head.set_mag(mag)
    head.set_kinst(inst)
    head.set_kstnm(stnm)
    head.set_stloc(stla, stlo)
    # set necessary header variables' values
    head.set_iftype(iftype)
    head.set_npts(npts)
    head.set_delta(delta)
    head.set_leven()
    head.set_be(step, td)
    head.set_nvhdr()

    # close all files
    fn.close()
    return head, tmp4
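`ymd2jday` is called above but not defined in these snippets; from its use for the SAC header field `nzjday` it should return the day of the year. A minimal sketch using the standard library (the name and signature are assumed from the call site):

import datetime

def ymd2jday(year, month, day):
    # Day of the year (1..366), as expected by the SAC header field nzjday.
    return datetime.date(year, month, day).timetuple().tm_yday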
def main():
    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--env-name', default="HalfCheetah-v2",
                        help='name of the environment to run')
    parser.add_argument('--policy', default="Gaussian",
                        help='algorithm to use: Gaussian | Deterministic')
    parser.add_argument('--eval', type=bool, default=True,
                        help='Evaluates a policy every 10 episodes (default: True)')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                        help='target smoothing coefficient (τ) (default: 0.005)')
    parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
                        help='learning rate (default: 0.0003)')
    parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
                        help='Temperature parameter α determines the relative importance of the entropy term against the reward (default: 0.2)')
    parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
                        help='Temperature parameter α automatically adjusted.')
    parser.add_argument('--seed', type=int, default=456, metavar='N',
                        help='random seed (default: 456)')
    parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                        help='batch size (default: 256)')
    parser.add_argument('--num_steps', type=int, default=2000001, metavar='N',
                        help='maximum number of steps (default: 2000000)')
    parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                        help='hidden size (default: 256)')
    parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                        help='model updates per simulator step (default: 1)')
    parser.add_argument('--start_steps', type=int, default=10000, metavar='N',
                        help='steps sampling random actions (default: 10000)')
    parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                        help='value target update per no. of updates per step (default: 1)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--cuda', action="store_true",
                        help='run on CUDA (default: False)')
    parser.add_argument('--resume-name', default=None,
                        help='Name of saved model to load')
    args, unknown = parser.parse_known_args()

    # Import custom envs
    import gym_match_input_continuous
    import deepdrive_2d

    # TensorboardX
    run_name = '{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else "")

    # Log to file
    os.makedirs('logs', exist_ok=True)
    log.add(f'logs/{run_name}.log')
    log.info(' '.join(sys.argv))

    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    env = gym.make(args.env_name)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    env.reset()

    # Agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    if args.resume_name:
        agent.load_model(f'{DIR}/models/sac_actor_runs/{args.resume_name}',
                         f'{DIR}/models/sac_critic_runs/{args.resume_name}')

    run_name = 'runs/' + run_name
    writer = SummaryWriter(logdir=run_name)

    # Memory
    memory = ReplayMemory(args.replay_size)

    train(agent, args, env, memory, run_name, writer)
    env.close()
dest="continue_training", help="Continue training from a checkpoint") parser.add_argument("-r", "--render_testing", action="store_true", default=True, dest="render_testing", help="Render window when testing agent.") parser.add_argument("-n", "--num_test_games", action="store", default=1, type=int, dest="num_test_games", help="How many games to play when testing.") parser.add_argument("--version", action="version", version="PyTorch-SAC Version 0.1") args = parser.parse_args() sac = SAC(env_name=args.env_name, data_save_dir=os.path.join("runs", args.log_dir)) if not args.test: sac.train(resume_training=args.continue_training) else: sac.test(render=args.render_testing, use_internal_policy=False, num_games=args.num_test_games)
env.get_observation_space_size = new_get_observation_space_size
env.observation_space = ([0] * env.get_observation_space_size(),
                         [0] * env.get_observation_space_size())
env.observation_space = convert_to_gym(env.observation_space)

# Create log dir for callback model saving
os.makedirs("./temp_models/", exist_ok=True)
env = Monitor(env, "./temp_models/", allow_early_resets=True)

##### TRAIN #####
if args.train:
    check_overwrite(args.model)
    model = SAC(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard_log/")
    model.learn(total_timesteps=int(args.step), log_interval=10,
                tb_log_name="log", callback=callback.callback)
    model.save(MODELS_FOLDER_PATH)

##### TEST #####
if not args.train:
    model = SAC.load(MODELS_FOLDER_PATH)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(scale_range(action, -1, 1, 0, 1))
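`scale_range` maps the agent's [-1, 1] actions into the environment's [0, 1] range but is not defined in these snippets; a minimal linear-rescale sketch under that assumption (name and argument order taken from the call site):

def scale_range(x, in_min, in_max, out_min, out_max):
    # Linearly map x from [in_min, in_max] to [out_min, out_max]; works on numpy arrays too.
    return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min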
import gym

from sac import SAC
from utils.sac_runner import vector_train
from utils.sac_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("Pendulum-v0", num_envs=4, asynchronous=True)
    actor = SAC(env.single_observation_space, env.single_action_space,
                p_lr=1e-3, q_lr=1e-3)
    returns = vector_train(actor, env, 40000, -200)

    eval_env = gym.make("Pendulum-v0")
    evaluate(actor, eval_env, 1, True)
memory_discount = 0.95
memory_horizon = 1

sac_params = {
    "linear_output": sac_input,
    "lr": 0.0003,
    "target_entropy": -2,
    "batch_size": 64,
    "hidden_size": 128
}

# Create the controller for the Donkey env
env = Car("kari_main", "mqtt.eclipse.org")
env.reset()

# Create the SAC agent to control the env
agent = SAC(parameters=sac_params)

# Create the state representation functionality
if input("Load model?"):
    agent = torch.load("model.pth")

throttle_weight_1 = 0.1
throttle_weight_2 = -5

STEER_LIMIT_LEFT = -1
STEER_LIMIT_RIGHT = 1
THROTTLE_MAX = 0.23
THROTTLE_MIN = 0.15
MAX_STEERING_DIFF = 0.5

action_space = spaces.Box(low=np.array([STEER_LIMIT_LEFT, THROTTLE_MIN]),
from gym import spaces

from functions import process_image, image_to_ascii, rgb2gray
from episode_buffer import EpisodeBuffer

## SAC hyperparameters
params = {
    "target_entropy": -4,
    "hidden_size": 64,
    "batch_size": 64,
    "discount": 0.95,
    "lr": 0.0001
}

alg = SAC(parameters=params)
car = Car(car="kari_main")
car.reset()

## Other hyperparameters
training_after_episodes = 1
episode = 0
random_episodes = 5

cmd = input(
    "If you want to load a model, give model path, default last checkpoint.")
if cmd != "":
def train_rnn_sac(path, threshold=0.02):
    env = gym.make('ActivePerception-v0')
    rnn = RNNFilter().to(device)
    rnn.load_model(path)
    sac = SAC()

    # set up the experiment folder
    experiment_id = "rsac_" + get_datetime()
    save_path = CKPT + experiment_id + ".pt"

    max_frames = 100000
    frame_idx = 0
    best_loss = np.inf
    pbar = tqdm(total=max_frames)
    stats = {'losses': []}
    best_reward = 0
    avg_reward = 0
    avg_mse = 0
    episode = 0

    while frame_idx < max_frames:
        pbar.update(1)
        episode += 1
        env.sid = env.sid % 9900
        scene_data, obs = env.reset(False)
        S, A, R, D = [], [], [], []
        s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
        o = trans_rgb(obs['o']).to(device)    # [1, C, H, W] rgb
        d = trans_d(obs['d']).to(device)      # [1, 1, H, W] depth
        s_, h = rnn(o, d)
        prev_mse = F.mse_loss(s_, s).item()
        h_numpy = h.view(-1).detach().cpu().numpy()
        S.append(h_numpy)

        for _ in range(7):
            frame_idx += 1
            th = sac.policy_net.get_action(h_numpy.reshape(1, -1))
            obs = env.step(th.item())
            o = trans_rgb(obs['o']).to(device)
            d = trans_d(obs['d']).to(device)
            th = torch.FloatTensor([th]).view(1, -1).to(device)
            s_, h = rnn(o, d, th, h)
            mse = F.mse_loss(s_, s).item()
            # r = (mse - prev_mse)*100
            prev_mse = mse
            d = (mse < threshold)
            r = 8 if d else -1
            h_numpy = h.view(-1).detach().cpu().numpy()
            S.append(h_numpy)
            A.append(th.cpu().numpy().reshape(-1))
            R.append(r)
            D.append(d)
            if d:
                break

        S, NS = S[:-1], S[1:]
        for s, a, r, ns, d in zip(S, A, R, NS, D):
            sac.replay_buffer.push(s, a, r, ns, d)
        if len(sac.replay_buffer) > batch_size:
            sac.soft_q_update(batch_size)

        avg_reward += np.array(R).sum()
        avg_mse += prev_mse
        if episode % 10 == 0:
            avg_reward /= 10
            avg_mse /= 10
            tqdm.write("[INFO] epi %05d | avg r: %10.4f | avg mse: %10.4f"
                       % (episode, avg_reward, avg_mse))
            if avg_reward > best_reward:
                best_reward = avg_reward
                sac.save_model(save_path)
            avg_reward = 0
            avg_mse = 0
    env = gym.make(args.env_name, reward_type='dense')
else:
    env = gym.make(args.env_name)
test_env = gym.make(args.env_name)
args.cuda = True if torch.cuda.is_available() else False

# Agent
if args.gcp:
    obs_space = env.observation_space['desired_goal'].shape[0] + \
                env.observation_space['observation'].shape[0]
else:
    obs_space = env.observation_space.shape[0]
args.automatic_entropy_tuning = True
agent = SAC(obs_space, env.action_space, args)

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    if args.gcp:
        goal = state['desired_goal']
def main(): """ The main file of the project """ # args and warnings ignoring setup simplefilter(action="ignore") parser = build_argparser() args = parser.parse_args() # environment setup env = NormalizedActions(gym.make( ENV_NAME)) # to ensure actions in [-1, 1] get correctly translated # setting libraries seeds to try and have repeatability torch.manual_seed(args.seed) np.random.seed(args.seed) env.seed(args.seed) # agent setup agent = SAC(env.observation_space, env.action_space, args) agent.load_networks_parameters(args.load_params) # if verbose, print a tabular recap of the args passed via command-line (or default ones) if args.verbose >= 1: t = Texttable() t.set_cols_dtype(['t', 'e']) t.add_rows([["Argument", "Value"]] + [[arg, getattr(args, arg)] for arg in vars(args)] + [["device", agent.device]]) print(t.draw()) print("\nSetup completed. Settings shown in the table above.") # training if args.train: input("\nPress any key to begin training.") try: train(env, agent, args) except KeyboardInterrupt: # to stop training print("\nInterrupt received.") except Exception: # if anything else happens, catch the exception and print it but without crashing traceback.print_exc() finally: print("\nTraining terminated.") # if required to save parameters, or need them for later testing, save them if args.save_params or args.test: global PARAMS_DIR PARAMS_DIR = agent.save_networks_parameters( args.save_params_dir) # save the plot that has been generated so far, if any if args.plot: save_plot() # close the environment env.close() # testing if args.test: try: # build environment and agent env = NormalizedActions(gym.make(ENV_NAME)) agent = SAC(env.observation_space, env.action_space, args) if PARAMS_DIR is None: # then look if the user has specified a directory for loading parameters if args.load_params is None: # then the agent will not load any parameters and will therefore act purely random print("WARNING: Testing a random agent.") else: PARAMS_DIR = args.load_params print("Using selected parameters.") else: print("Using training parameters.") # initialize agent's networks' parameters agent.load_networks_parameters(PARAMS_DIR) input("\nPress any key to begin testing.") test(env, agent, args) except KeyboardInterrupt: # to stop testing print("\nInterrupt received.") except Exception: # if anything else happens, catch the exception and print it but without crashing traceback.print_exc() finally: print("\nTesting terminated.") # save the plot that has been generated so far, if any if args.plot: save_plot() # close the environment env.close()
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
env.seed(args.seed)
env.action_space.seed(args.seed)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# Tensorboard
path = 'runs/{}_SAC_{}_{}_{}_seed_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else "", args.seed)
writer = SummaryWriter(path)

# Memory
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(0):
def runner(env, actor_path, critic_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False, args=None, render=True):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    # initial network
    pi = SAC(ob_space.shape[0], ac_space, args)
    # pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    # U.initialize()

    # Prepare for rollouts: load model
    # ----------------------------------------
    pi.load_model(actor_path=actor_path, critic_path=critic_path)
    # u.load_variables(load_model_path)

    obs_list = []
    obs1_list = []
    acs_list = []
    reward_list = []
    done_list = []
    episode_len_list = []
    episode_return_list = []
    current_traj_num = 0
    current_abandon_traj_num = 0
    while current_traj_num <= number_trajs:
        traj = traj_1_generator(pi, env, timesteps_per_batch, render,
                                stochastic=stochastic_policy)
        if traj['ep_len'] < timesteps_per_batch:
            current_abandon_traj_num += 1
            print("abandon episode number: {}!, episode len: {}".format(
                current_abandon_traj_num, traj['ep_len']))
            continue
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs1, reward, done = traj['obs1'], traj['rew'], traj['new']
        # append multi-dimensional arrays
        obs_list.append(obs)
        obs1_list.append(obs1)
        acs_list.append(acs)
        episode_len_list.append(ep_len)
        episode_return_list.append(ep_ret)
        done_list.append(done)
        reward_list.append(reward)
        current_traj_num += 1
        if current_traj_num % 1 == 0:  # control the print frequency
            print("accept episode number: {}, len: {}, returns: {}".format(
                current_traj_num, ep_len, ep_ret))
        if current_traj_num >= number_trajs:
            break

    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')

    if save:
        # Assemble the file name
        file_path = 'gather_expert_demonstration/expert_demonstration_data/model_guide/'
        file_name = ('stochastic' if stochastic_policy else 'deterministic') + '_SAC_' \
                    + env.spec.id + "(6000)" + '_johnny'
        path = osp.join(file_path, file_name)
        # Save the gathered data collections to the filesystem
        np.savez(path,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(episode_len_list),
                 returns=np.array(episode_return_list),
                 done=np.array(done_list),
                 reward=np.array(reward_list))
        print("saving demonstrations")
        print("  @: {}.npz".format(path))

        # save expert data for the contrast algorithm SAM
        # Assemble the file name
        file_path = 'gather_expert_demonstration/expert_demonstration_data/sam/'
        file_name = ('stochastic' if stochastic_policy else 'deterministic') + '_SAC_' \
                    + env.spec.id + "(6000)" + '_sam'
        path = osp.join(file_path, file_name)
        np.savez(path,
                 obs0=np.array(obs_list),
                 acs=np.array(acs_list),
                 env_rews=np.array(reward_list),
                 dones1=np.array(done_list),
                 obs1=np.array(obs1_list),
                 ep_lens=np.array(episode_len_list),
                 ep_env_rets=np.array(episode_return_list))
        print("saving demonstrations")
        print("  @: {}.npz".format(path))

    avg_len = sum(episode_len_list) / len(episode_len_list)
    avg_ret = sum(episode_return_list) / len(episode_return_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct,
              logdir, debug, gpu):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparametrize,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        old_funct=old_funct,
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    with tf.Session(config=tf_config) as sess:
        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, session=sess,
                                     n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
args = parser.parse_args()

wandb.init(name=f"{args.env_name}-HERLoaded", project="MyExp")

# Environment
env = gym.make(args.env_name)
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
if args.env_name.startswith('Fetch'):
    env_space = env.observation_space.spaces
    agent = SAC(env_space['observation'].shape[0] + env_space['desired_goal'].shape[0],
                env.action_space, args)
else:
    agent = SAC(env.observation_space.shape[0] + 2, env.action_space, args)

# Memory
memory = ReplayGMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0
did_it = False

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0