def make_env(seed):
    env = gym.make('CarRacing-v0')
    env = NormalizeRGB(env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env.seed(seed)
    np.random.seed(seed)
    return env
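
# Hedged sketch (not part of the original source): make_env relies on the project's
# custom observation wrappers (NormalizeRGB, CropCarRacing, ResizeObservation). As an
# illustration only, a NormalizeRGB-style wrapper could rescale uint8 frames to
# float32 values in [0, 1]; the project's real implementation may differ.
import gym
import numpy as np

class NormalizeRGBSketch(gym.ObservationWrapper):
    """Rescale uint8 RGB observations to float32 values in [0, 1]."""
    def __init__(self, env):
        super().__init__(env)
        shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=shape, dtype=np.float32)

    def observation(self, obs):
        return obs.astype(np.float32) / 255.0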
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--eb', type=int, default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy', type=str, default=None,
                        help='Policy checkpoint to restore.')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--render', action='store_true',
                        help='render the environment')
    parser.add_argument('--baseline', action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval', type=int, default=100,
                        help='interval between rendered epochs (default: 100)')
    parser.add_argument('--env', type=str, default='CarRacing-v0',
                        help='environment to train on (default: CarRacing-v0)')
    parser.add_argument('--vae', type=str, default=None,
                        help='VAE checkpoint to load')
    parser.add_argument('--arch', type=str, default='base_car_racing',
                        help='Model architecture.')
    args = parser.parse_args()

    # Initialize environment: crop, resize, keep the green channel, normalize,
    # then encode observations with the pretrained VAE
    env = gym.make(args.env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env = VAEObservation(env, args.vae, arch=args.arch)
    print(env.observation_space)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)

    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())

    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')

    # Tensorboard writer
    writer = SummaryWriter('policy_logs/' + alias)

    # Declare policy
    policy = Policy(env)
    if args.policy:
        policy.load_state_dict(torch.load(args.policy))
        policy.eval()

    # Declare sampler
    sampler = Sampler(env, args.horizon)

    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0, args.episodes, args.eb, desc="Episodes", unit_scale=args.eb):
        # Sample trajectories
        trajectories = sampler.sample(args.eb, policy, render=(i_episode % args.render_interval == 0))
        # Update policy
        finish_episode(trajectories, policy, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward over the last 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward', np.mean(running_reward), i_episode + sub_i)
            # Summaries: episode length and reward
            writer.add_scalar('data/episode_len', episode_lens[sub_i], i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i], i_episode + sub_i)
        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(policy.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break

    # Save final model
    torch.save(policy.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()
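
# Hedged sketch (not part of the original source) of the update step invoked via
# finish_episode above. The real implementation lives elsewhere in this project; this
# version only illustrates the REINFORCE gradient with an optional baseline.
# Assumptions: trajectories['log_probs'] holds per-step action log-probabilities as
# torch tensors, and `optimizer` is an optimizer over policy.parameters().
import torch

def finish_episode_sketch(trajectories, policy, optimizer, args):
    """One REINFORCE update over a batch of sampled episodes."""
    policy_loss = []
    for ep_rewards, ep_log_probs in zip(trajectories['rewards'], trajectories['log_probs']):
        # Discounted returns G_t, computed backwards through the episode
        returns, G = [], 0.0
        for r in reversed(ep_rewards):
            G = r + args.gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns, dtype=torch.float32)
        # Optional baseline: center the returns within the episode
        if args.baseline:
            returns = returns - returns.mean()
        for log_prob, G_t in zip(ep_log_probs, returns):
            policy_loss.append(-log_prob * G_t)
    optimizer.zero_grad()
    torch.stack(policy_loss).sum().backward()
    optimizer.step()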
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    # Logging
    parser.add_argument('--alias', type=str, default='base',
                        help='Alias of the model.')
    parser.add_argument('--render_interval', type=int, default=100,
                        help='interval between rendered epochs (default: 100)')
    # Learning parameters
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--eb', type=int, default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy', type=str, default=None,
                        help='Policy checkpoint to restore.')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--baseline', action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    args = parser.parse_args()

    # Check cuda
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")

    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)

    # Create the alias for the run
    alias = '%s_%s' % (args.alias, time.time())

    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')

    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)

    # Create VAE policy
    vape = VAEPolicy()
    optimizer = optim.Adam(vape.parameters(), lr=1e-04)

    # Animation of environment
    obs = env.reset()
    obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
    rebuild = vape.encode_decode(obs_torch)
    rebuild = NHWC(rebuild.detach().cpu().numpy()[0])
    fig1 = plt.figure()
    if len(obs.shape) == 3 and (obs.shape[-1] == 1):
        im = plt.imshow(side_by_side(obs, rebuild), cmap="Greys")
    else:
        im = plt.imshow(side_by_side(obs, rebuild))
    done = False
    HORIZON = 200
    timestep = 0

    # Setting animation update function
    def updatefig(*fargs):
        nonlocal done
        nonlocal obs
        nonlocal timestep
        obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
        if not done and timestep < HORIZON:
            action, action_proba = vape.act(obs_torch)
            action = action[0].detach().cpu().numpy()
            obs, reward, done, info = env.step(action)
            env.render(mode='human')
            timestep += 1
        else:
            done = False
            obs = env.reset()
            timestep = 0
        rebuild = vape.encode_decode(obs_torch)
        rebuild = NHWC(rebuild.detach().cpu().numpy()[0])
        im.set_array(side_by_side(obs, rebuild))
        vape.optimize_vae(obs_torch, optimizer)
        time.sleep(0.01)
        return im,

    # Start animation
    ani = animation.FuncAnimation(fig1, updatefig, interval=50, blit=True)
    plt.show()

    # Close env and writer
    env.close()
    writer.close()
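
# Hedged sketch (not part of the original source) of the small image helpers used
# above (NCHW, NHWC, side_by_side). The real helpers live elsewhere in this project;
# these illustrative versions assume observations are HWC numpy arrays in [0, 1] and
# that torch tensors use the NCHW layout.
import numpy as np

def NCHW_sketch(batch):
    """List/array of HWC images -> NCHW array expected by torch conv layers."""
    return np.transpose(np.asarray(batch), (0, 3, 1, 2))

def NHWC_sketch(image):
    """Single CHW image back to HWC layout for matplotlib display."""
    return np.transpose(np.asarray(image), (1, 2, 0))

def side_by_side_sketch(left, right):
    """Concatenate the original observation and its reconstruction along the width."""
    return np.concatenate([left, right], axis=1)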
args.cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if args.cuda else "cpu")

# Loading the dataset
if args.dataset:
    dataset = np.array(pickle.load(open(args.dataset, 'rb')))
    N_SAMPLES, W, H, CHANNELS = dataset.shape
    print("Dataset size:", N_SAMPLES)
    print("Channels:", CHANNELS)
    print("Image dim: (%d,%d)" % (W, H))
    dataset_torch = torch.from_numpy(NCHW(dataset)).float().to(device)
else:
    print("Using gym environment directly.")
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = NormalizeRGB(env)
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env.seed(args.seed)

# Network creation
VAE_class = VAEbyArch(args.arch)
vae = VAE_class(latent_size=args.latent_size).to(device)

# Restore checkpoint
assert args.vae, "No checkpoint provided."
vae.load_state_dict(torch.load(args.vae))
vae.eval()

if args.dataset:
    # Single observation display
    mu, log_sigma, z, rebuild = vae(dataset_torch[args.sample:args.sample+1])
    rebuild = rebuild.detach().cpu().numpy()[0]
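
# Hedged continuation sketch (not part of the original source): one way the snippet
# above could display the selected sample next to its VAE reconstruction. Assumes the
# dataset holds HWC images in [0, 1]; the project's actual display code may differ.
import matplotlib.pyplot as plt
import numpy as np

original = dataset[args.sample]                      # HWC image in [0, 1]
reconstruction = np.transpose(rebuild, (1, 2, 0))    # CHW -> HWC
comparison = np.concatenate([original, reconstruction], axis=1)
plt.imshow(comparison.squeeze(), cmap='Greys' if comparison.shape[-1] == 1 else None)
plt.title('Original vs. VAE reconstruction (sample %d)' % args.sample)
plt.axis('off')
plt.show()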
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--vae_lr', type=float, default=1e-04,
                        help='VAE learning rate (default: 1e-04)')
    parser.add_argument('--eb', type=int, default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--controller', type=str, default=None,
                        help='Controller checkpoint to restore.')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--render', action='store_true',
                        help='render the environment')
    parser.add_argument('--baseline', action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval', type=int, default=100,
                        help='interval between rendered epochs (default: 100)')
    parser.add_argument('--avoidance', type=str, default='self',
                        help='Avoidance scheme')
    parser.add_argument('--dist', type=str, default='beta',
                        help='Action probability distribution.')
    parser.add_argument('--avoidance_max', type=float, default=1.0,
                        help='Avoidance max value')
    args = parser.parse_args()

    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    # env = ActionScaler(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)

    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())

    # Use alias for checkpoints
    checkpoint_best_filename = 'weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'weights/' + alias + '_final.torch'
    if not os.path.exists('weights/'):
        os.makedirs('weights/')

    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)

    # Declare VAE policy
    vape = VAEPolicy(avoidance=args.avoidance, avoidance_threshold=args.avoidance_max, vae_lr=args.vae_lr)
    if args.controller:
        vape.load_state_dict(torch.load(args.controller))

    # Declare sampler
    sampler = Sampler(env, args.horizon)

    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0, args.episodes, args.eb, desc="Episodes", unit_scale=args.eb):
        # Sample trajectories
        trajectories, losses_and_info = sampler.sample(args.eb, vape, render=False)  # render=(i_episode % args.render_interval == 0)
        reco_loss, norm_loss, total_loss, added_to_batch, avoidance_score = zip(*losses_and_info)
        # Update policy
        finish_episode(trajectories, vape, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward over the last 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward', np.mean(running_reward), i_episode + sub_i)
            # Summaries: episode length and reward
            writer.add_scalar('data/episode_len', episode_lens[sub_i], i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i], i_episode + sub_i)
        # Summaries: per-batch VAE and avoidance statistics
        writer.add_scalar('data/added_to_batch', np.sum(added_to_batch), i_episode // args.eb)
        writer.add_scalar('data/mean_avoidance', np.mean(avoidance_score), i_episode // args.eb)
        writer.add_scalar('data/reco_loss', np.mean(reco_loss), i_episode // args.eb)
        writer.add_scalar('data/norm_loss', np.mean(norm_loss), i_episode // args.eb)
        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(vape.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break

    # Save final model
    torch.save(vape.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()
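
# Hedged sketch (not part of the original source) of the two loss terms logged above
# (reco_loss, norm_loss). The actual computation happens inside VAEPolicy; this only
# illustrates a standard VAE objective: a reconstruction term plus a KL term that
# pushes the approximate posterior N(mu, sigma^2) towards the prior N(0, I).
import torch
import torch.nn.functional as F

def vae_losses_sketch(x, x_rebuild, mu, log_sigma):
    """Reconstruction loss + KL divergence for one batch of observations."""
    reco_loss = F.mse_loss(x_rebuild, x, reduction='sum')
    # KL(N(mu, sigma^2) || N(0, 1)), summed over batch and latent dimensions
    norm_loss = -0.5 * torch.sum(1 + 2 * log_sigma - mu.pow(2) - (2 * log_sigma).exp())
    return reco_loss, norm_loss, reco_loss + norm_loss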