def generate_expert_trajectorys(args):
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    expert = PurePursuitExpert(env=env)

    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        observations = []
        actions = []
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
            # env.render()
        env.reset()
        torch.save(actions, '{}/data_a_{}.pt'.format(args.data_directory, episode))
        torch.save(observations, '{}/data_o_{}.pt'.format(args.data_directory, episode))
    env.close()
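The rollouts above are stored as one pair of `.pt` files per episode. As a usage sketch (not part of the original pipeline), they could be reloaded for supervised training like this, assuming the same `data_directory` and episode count used during generation:

# Sketch only: reload the per-episode expert data saved above.
# `load_expert_trajectories` is a hypothetical helper, not part of the original code.
import torch
import numpy as np

def load_expert_trajectories(data_directory, episodes):
    observations, actions = [], []
    for episode in range(episodes):
        # Each episode was saved as a plain Python list of numpy arrays
        actions.extend(torch.load('{}/data_a_{}.pt'.format(data_directory, episode)))
        observations.extend(torch.load('{}/data_o_{}.pt'.format(data_directory, episode)))
    return np.array(observations), np.array(actions)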
def _enjoy():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
def _enjoy(args):
    from learning.utils.env import launch_env
    from learning.utils.wrappers import NormalizeWrapper, ImgWrapper, \
        DtRewardWrapper, ActionWrapper, ResizeWrapper
    from learning.utils.teacher import PurePursuitExpert

    # model = Model(action_dim=2, max_action=1.)
    model = Generator(action_dim=2)
    try:
        # state_dict = torch.load('models/imitate.pt', map_location=device)
        state_dict = torch.load('models/G{}.pt'.format(args.enjoy_tag), map_location=device)
        model.load_state_dict(state_dict)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        print('failed to load model')
        exit()
    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    # max_count = 0
    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)
        action = model(obs)
        action = action.squeeze().data.cpu().numpy()
        print("\nAction taken::", action, "\n")
        obs, reward, done, info = env.step(action)
        env.render()

        # if max_count > 50:
        #     max_count = 0
        #     obs = env.reset()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)
            # max_count += 1
            obs = env.reset()
            env.render()
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")

    # Name used for intermediate checkpoints and evaluation logs
    file_name = 'ddpg'

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0

    print("Starting training")
    while total_timesteps < args.max_timesteps:
        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            if total_timesteps != 0:
                print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                      (total_timesteps, episode_num, episode_timesteps, episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy))
                print("rewards at time {}: {}".format(total_timesteps, evaluations[-1]))

                if args.save_models:
                    policy.save(file_name, directory=args.model_dir)
                np.savez("./results/{}.npz".format(file_name), evaluations)

            # Reset environment
            env_counter += 1
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])
                ).clip(env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        if episode_timesteps >= args.env_timesteps:
            done = True

        done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
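`evaluate_policy` is called above but not defined in this section. A minimal sketch, assuming it simply averages the undiscounted return of the current policy over a few evaluation rollouts (the `eval_episodes` and `max_timesteps` defaults are assumptions):

# Assumed helper, not part of the listing above: average return over eval_episodes rollouts.
def evaluate_policy(env, policy, eval_episodes=10, max_timesteps=500):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        step = 0
        while not done and step < max_timesteps:
            action = policy.predict(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
            step += 1
    return avg_reward / eval_episodes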
class Worker(mp.Process):
    def __init__(self, global_net, optimizer, args, info, identifier, logger):
        super(Worker, self).__init__()
        self.global_net = global_net
        self.optimizer = optimizer
        self.args = args
        self.info = info
        self.identifier = identifier
        self.name = f'worker-{identifier}'
        self.total_step = 0
        self.ckpt_dir, self.ckpt_path, self.log_dir = logger.get_log_dirs()

    def calc_loss(self, args, values, log_probs, actions, rewards):
        np_values = values.view(-1).data.numpy()

        # Actor loss: Generalized Advantage Estimation A = R(lambda) - V(s), Schulman
        # Paper: High-Dimensional Continuous Control Using Generalized Advantage Estimation
        delta_t = np.asarray(rewards) + args.gamma * np_values[1:] - np_values[:-1]
        advantage = discount(delta_t, args.gamma)

        # Select log probabilities of the actions the agent executed
        action_log_probabilities = log_probs.gather(1, torch.tensor(actions).view(-1, 1))
        policy_loss = -(action_log_probabilities.view(-1) *
                        torch.FloatTensor(advantage.copy())).sum()

        # Critic loss: l2 loss over value estimator
        rewards[-1] += args.gamma * np_values[-1]
        discounted_reward = discount(np.asarray(rewards), args.gamma)
        discounted_reward = torch.tensor(discounted_reward.copy(), dtype=torch.float32)
        value_loss = .5 * (discounted_reward - values[:-1, 0]).pow(2).sum()

        # Entropy - Used for regularization
        # Entropy is a metric for the distribution of probabilities
        # -> We want to maximize entropy to encourage exploration
        entropy_loss = (-log_probs * torch.exp(log_probs)).sum()

        return policy_loss + 0.5 * value_loss - 0.01 * entropy_loss

    def run(self):
        from learning.utils.env import launch_env
        from learning.utils.wrappers import NormalizeWrapper, ImgWrapper, \
            DtRewardWrapper, ActionWrapper, ResizeWrapper, DiscreteWrapper_a6

        # We have to initialize the gym here, otherwise the multiprocessing will crash
        self.env = launch_env()
        # self.env = ResizeWrapper(self.env)
        # self.env = NormalizeWrapper(self.env)
        self.env = ImgWrapper(self.env)  # to convert the images from 160x120x3 into 3x160x120
        # self.env = ActionWrapper(self.env)
        self.env = DtRewardWrapper(self.env)
        self.env = DiscreteWrapper_a6(self.env)

        # Set seeds so we can reproduce our results
        self.env.seed(self.args.seed + self.identifier)
        torch.manual_seed(self.args.seed + self.identifier)

        self.local_net = Net(1, self.env.action_space.n)  # local network

        state = torch.tensor(preprocess_state(self.env.reset()))

        # bookkeeping
        start_time = last_disp_time = time.time()
        episode_length, epr, eploss, done = 0, 0, 0, True
        render_this_episode = False

        while self.info['frames'][0] <= self.args.max_steps:
            render_this_episode = self.args.graphical_output and (
                render_this_episode or
                (self.info['episodes'] % 10 == 0 and self.identifier == 0))

            # Sync parameters from global net
            self.local_net.load_state_dict(self.global_net.state_dict())

            # Reset hidden state of GRU cell / Remove hidden state from computational graph
            hx = torch.zeros(1, 256) if done else hx.detach()

            # Values used to compute gradients
            values, log_probs, actions, rewards = [], [], [], []

            for step in range(self.args.steps_until_sync):
                episode_length += 1

                # Inference
                value, logit, hx = self.local_net.forward((state.view(-1, 1, 80, 80), hx))
                action_log_probs = F.log_softmax(logit, dim=-1)

                # Sample an action from the distribution
                action = torch.exp(action_log_probs).multinomial(num_samples=1).data[0]
                np_action = action.numpy()[0]

                # Repeat the chosen action for a few frames and accumulate the reward
                done = False
                accumulated_reward = 0
                for x in range(self.args.action_update_steps):
                    if not done:
                        state, reward, done, _ = self.env.step(np_action)
                        accumulated_reward += reward
                reward = accumulated_reward

                state = torch.tensor(preprocess_state(state))
                epr += reward
                # reward = np.clip(reward, -1, 1)
                done = done or episode_length >= self.args.max_episode_steps

                if render_this_episode:
                    self.env.render()

                self.info['frames'].add_(1)
                num_frames = int(self.info['frames'].item())
                elapsed = time.time() - start_time

                if done:
                    # Update statistics and save the model periodically
                    self.info['episodes'] += 1

                    # Moving average statistics:
                    # Linear interpolation between the current average and the new value
                    # Allows us to better estimate quality of results with high variance
                    interp_factor = 1 if self.info['episodes'][0] == 1 else 1 - 0.99
                    self.info['run_epr'].mul_(1 - interp_factor).add_(interp_factor * epr)
                    self.info['run_loss'].mul_(1 - interp_factor).add_(interp_factor * eploss)

                    # Save the model every `save_frequency` episodes
                    if self.args.save_models and \
                            self.info['episodes'][0] % self.args.save_frequency == 0:
                        with open(f"{self.log_dir}/performance-{self.name}.txt", "a") as myfile:
                            myfile.write(
                                f"{self.info['episodes'].item():.0f} {num_frames} {epr} " +
                                f"{self.info['run_loss'].item()} {elapsed}\n")
                        torch.save(
                            {
                                'model_state_dict': self.global_net.state_dict(),
                                'optimizer_state_dict': self.optimizer.state_dict(),
                                'info': self.info
                            },
                            f"{self.ckpt_dir}/model-{self.name}-{int(self.info['episodes'].item())}.pth")
                        print("Saved model to:",
                              f"{self.ckpt_dir}/model-{self.name}-{self.info['episodes'].item()}")

                # print training info every minute
                if self.identifier == 0 and time.time() - last_disp_time > 60:
                    elapsed = time.strftime("%Hh %Mm %Ss",
                                            time.gmtime(time.time() - start_time))
                    print(f"[time]: {elapsed}, [episodes]: {self.info['episodes'].item(): .0f}," +
                          f" [frames]: {num_frames: .0f}, [mean epr]: {self.info['run_epr'].item():.2f}," +
                          f" [run loss]: {self.info['run_loss'].item(): .2f}")
                    last_disp_time = time.time()

                # reset buffers / environment
                if done:
                    episode_length, epr, eploss = 0, 0, 0
                    state = torch.tensor(preprocess_state(self.env.reset()))

                values.append(value)
                log_probs.append(action_log_probs)
                actions.append(action)
                rewards.append(reward)

            # Reached sync step -> We need a terminal value
            # If the episode did not end use estimation of V(s) to bootstrap
            next_value = torch.zeros(1, 1) if done else \
                self.local_net.forward((state.unsqueeze(0), hx))[0]
            values.append(next_value.detach())

            # Calculate loss
            loss = self.calc_loss(self.args, torch.cat(values), torch.cat(log_probs),
                                  torch.cat(actions), np.asarray(rewards))
            eploss += loss.item()

            # Calculate gradient
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.local_net.parameters(), 40)

            # sync gradients with global network
            for param, shared_param in zip(self.local_net.parameters(),
                                           self.global_net.parameters()):
                if shared_param.grad is None:
                    shared_param._grad = param.grad

            # Backpropagation
            self.optimizer.step()
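The worker relies on two helpers that are not shown in this listing, `discount` and `preprocess_state`. Below is a sketch of plausible implementations, assuming an 80x80 grayscale input as implied by the `state.view(-1, 1, 80, 80)` call; the exact resizing and normalization used in the original code may differ.

# Assumed helpers (not part of the Worker listing above).
import cv2
import numpy as np
from scipy.signal import lfilter

def discount(x, gamma):
    # Discounted cumulative sum along the time axis: y[t] = x[t] + gamma * y[t + 1]
    return lfilter([1], [1, -gamma], x[::-1])[::-1]

def preprocess_state(obs):
    # Collapse the Duckietown RGB frame to a single 80x80 grayscale channel in [0, 1]
    frame = np.mean(obs, axis=0) if obs.shape[0] == 3 else np.mean(obs, axis=-1)
    frame = cv2.resize(frame.astype(np.float32), (80, 80))
    return (frame / 255.0).astype(np.float32)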
def _main(args):
    ############## Hyperparameters ##############
    # env_name = "BipedalWalker-v2"
    env_name = 'Duckietown-loop_empty-v0'

    render = False

    lr = 0.0003  # parameters for Adam optimizer
    betas = (0.9, 0.999)

    random_seed = None
    print(args)
    #############################################

    # creating environment
    env = launch_env()

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # state_dim = env.observation_space.shape[0]
    state_dim = env.observation_space.shape
    state_dim = functools.reduce(operator.mul, state_dim, 1)
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, args.action_std, lr, betas, args.gamma,
              args.K_epochs, args.eps_clip, max_action, args.batch_size)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0
    episode_reward = 0

    # stats = pd.DataFrame(columns=["Episode", "Length", "Reward"])
    stats = []
    with open("PPO_stats.csv", 'w') as statsfile:
        statsfile.write("Epoch, Timesteps, Reward\n")

    # training loop
    for i_episode in range(1, args.max_episodes + 1):
        state = env.reset()
        for t in range(args.max_timesteps):
            time_step += 1

            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if time_step % args.update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0

            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t
        # stats = stats.append({"Episode": i_episode, "Length": t, "Reward": episode_reward}, ignore_index=True)
        stats.append((i_episode, t, episode_reward))
        running_reward += episode_reward
        episode_reward = 0

        if i_episode % args.store_interval == 0:
            torch.save(ppo.policy.state_dict(),
                       './PPO_continuous_{}.pth'.format(env_name))
            # stats.to_csv("PPO_stats.csv", index=False)  # This line does not work on Google Colab!
            with open("PPO_stats.csv", 'a') as statsfile:
                for eps, ts, rwd in stats:
                    statsfile.write("%d, %d, %f\n" % (eps, ts, rwd))
            stats = []

        # logging
        if i_episode % args.log_interval == 0:
            avg_length = int(avg_length / args.log_interval)
            running_reward = int(running_reward / args.log_interval)
            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
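`Memory` is the usual on-policy rollout buffer from PPO-PyTorch style implementations. A minimal sketch, assuming the attribute names used by `ppo.select_action` and the training loop above (states, actions, and logprobs are appended inside `select_action`):

# Assumed rollout buffer matching the attribute names used above.
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]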
def test():
    ############## Hyperparameters ##############
    # env_name = "BipedalWalker-v2"
    env_name = 'Duckietown-loop_empty-v0'
    # env = gym.make(env_name)

    # creating environment
    env = launch_env()

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape
    state_dim = functools.reduce(operator.mul, state_dim, 1)
    max_action = float(env.action_space.high[0])

    n_episodes = 3        # num of episodes to run
    max_timesteps = 500   # max timesteps in one episode
    render = True         # render the environment
    save_gif = False      # frames are saved as images in the ./gif folder

    # filename and directory to load model from
    filename = "PPO_continuous_" + env_name + ".pth"
    directory = "./preTrained/"

    action_std = 0.05     # constant std for action distribution (Multivariate Normal)
    K_epochs = 80         # update policy for K epochs
    eps_clip = 0.2        # clip parameter for PPO
    gamma = 0.99          # discount factor

    lr = 0.0003           # parameters for Adam optimizer
    betas = (0.9, 0.999)
    #############################################

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs,
              eps_clip, max_action, 32)
    ppo.policy_old.load_state_dict(
        torch.load(directory + filename, map_location=torch.device('cpu')))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0

    env.close()
def _train(args):
    print("Running Expert for {} Episodes of {} Steps".format(args.episodes, args.steps))
    print("Training Learning for {} Epochs with Batch Size of {}".format(
        args.epochs, args.batch_size))

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = TensorflowModel(
        observation_shape=observation_shape,  # from the logs we've got
        action_shape=action_shape,  # same
        graph_location=args.model_directory,  # where do we want to store our trained models
        seed=args.seed  # to seed all random operations in the model (e.g., dropout)
    )

    for i in range(args.epochs):
        # we defined the batch size; this can be adjusted according to your computing resources
        loss = None
        for batch in range(0, len(observations), args.batch_size):
            print("Training batch", batch)
            loss = model.train(observations=observations[batch:batch + args.batch_size],
                               actions=actions[batch:batch + args.batch_size])

        # every 10 epochs, we store the model we have
        if i % 10 == 0:
            model.commit()

    print("Training complete!")
def _train(args):
    # Ensure that multiprocessing works properly without deadlock...
    if sys.version_info[0] > 2:
        mp.set_start_method('spawn')

    env = launch_env()
    # env = ResizeWrapper(env)
    # env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    # env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    env = DiscreteWrapper_a6(env)

    # Set seeds
    seed(args.seed)

    logger = Logger("models")
    ckpt_dir, ckpt_path, log_dir = logger.get_log_dirs()

    shape_obs_space = env.observation_space.shape  # (3, 120, 160)
    shape_action_space = env.action_space.n

    print("Initializing Global Network")
    global_net = a3c.Net(channels=1, num_actions=shape_action_space)
    global_net.share_memory()  # share the global parameters in multiprocessing
    optimizer = CustomOptimizer.SharedAdam(global_net.parameters(), lr=args.learning_rate)
    info = {
        k: torch.DoubleTensor([0]).share_memory_()
        for k in ['run_epr', 'run_loss', 'episodes', 'frames']
    }

    if args.model_file is not None:
        cwd = os.getcwd()
        filepath = os.path.join(cwd, args.model_dir, args.model_file)
        checkpoint = torch.load(filepath)
        global_net.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        info = checkpoint['info']
        print('Loaded model:', args.model_file)

    print("Instantiating %i workers" % args.num_workers)
    workers = [
        a3c.Worker(global_net, optimizer, args, info, identifier=i, logger=logger)
        for i in range(args.num_workers)
    ]

    print("Start training...")
    interrupted = False
    for w in workers:
        w.daemon = True
        w.start()
    try:
        [w.join() for w in workers]
    except KeyboardInterrupt:
        [w.terminate() for w in workers]
        interrupted = True

    if not interrupted or args.save_on_interrupt:
        print("Finished training.")
        if args.save_models:
            path = os.path.join(ckpt_dir, 'model-final.pth')
            torch.save(
                {
                    'model_state_dict': global_net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'info': info
                }, path)
            print("Saved model to:", f"{ckpt_dir}/model-final")
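`DiscreteWrapper_a6` is imported from `learning.utils.wrappers` and is not shown in this section. The sketch below illustrates the idea of mapping a small discrete action set onto (velocity, steering) commands; the six action values are illustrative assumptions only, not the table used by the actual wrapper.

# Assumed discrete action wrapper: the real action table is defined in learning.utils.wrappers.
import gym
import numpy as np

class DiscreteWrapper_a6(gym.ActionWrapper):
    def __init__(self, env):
        super(DiscreteWrapper_a6, self).__init__(env)
        self.action_space = gym.spaces.Discrete(6)
        self._actions = [
            [0.8, 0.0],   # forward
            [0.4, 0.0],   # slow forward
            [0.3, 1.0],   # left
            [0.3, -1.0],  # right
            [0.2, 2.0],   # hard left
            [0.2, -2.0],  # hard right
        ]

    def action(self, action):
        # Translate the discrete index into a continuous (velocity, steering) command
        return np.array(self._actions[action])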
def _train(args):
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    # model = Model(action_dim=2, max_action=1.)
    model = Generator(action_dim=2)
    # state_dict = torch.load('models/G_imitate_2.pt', map_location=device)
    # model.load_state_dict(state_dict)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0], (args.batch_size))
        obs_batch = torch.from_numpy(observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss = loss.data.item()
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, avg_loss))

        # Periodically save the trained model
        if epoch % 200 == 0:
            torch.save(model.state_dict(), '{}/G_imitate.pt'.format(args.model_directory))