def make_env(seed):
    env = gym.make('CarRacing-v0')
    env = NormalizeRGB(env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env.seed(seed)
    np.random.seed(seed)
    return env
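# Usage sketch (assumption): a short random rollout to check that make_env wires the
# wrappers together as expected. `gym`, `np`, and the custom wrappers are assumed to be
# imported as in the other snippets in this collection; the old gym step/reset API is used.
if __name__ == '__main__':
    env = make_env(seed=42)
    obs = env.reset()
    for _ in range(100):
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            break
    env.close()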
def run_agent(layout: str):
    env = PacmanEnv(layout)
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')
    n_actions = env.action_space.n
    model = load_model(screen.shape, n_actions, 'pacman.pth')
    for i in range(10):
        env.render(mode='human')
        screen = env.reset(mode='rgb_array')
        for _ in count():
            env.render(mode='human')
            action = select_action(screen, 0, model, n_actions)
            screen, reward, done, info = env.step(action)
            if done:
                break
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--eb', type=int, default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy', type=str, default=None,
                        help="""Policy checkpoint to restore.""")
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--render', action='store_true',
                        help='render the environment')
    parser.add_argument('--baseline', action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval', type=int, default=100,
                        help='interval between rendered epochs (default: 100)')
    parser.add_argument('--env', type=str, default='CarRacing-v0',
                        help='environment to train on (default: CarRacing-v0)')
    parser.add_argument('--vae', type=str, default=None,
                        help='VAE checkpoint to load')
    parser.add_argument('--arch', type=str, default='base_car_racing',
                        help="""Model architecture.""")
    args = parser.parse_args()
    # Initialize environment
    env = gym.make(args.env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env = VAEObservation(env, args.vae, arch=args.arch)
    print(env.observation_space)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')
    # Tensorboard writer
    writer = SummaryWriter('policy_logs/' + alias)
    # Declare policy
    policy = Policy(env)
    if args.policy:
        policy.load_state_dict(torch.load(args.policy))
        policy.eval()
    # Declare sampler
    sampler = Sampler(env, args.horizon)
    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0, args.episodes, args.eb, desc="Episodes", unit_scale=args.eb):
        # Sample trajectories
        trajectories = sampler.sample(args.eb, policy, render=(i_episode % args.render_interval == 0))
        # Update policy
        finish_episode(trajectories, policy, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward over the last 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward', np.mean(running_reward), i_episode + sub_i)
            # Summaries: episode length and reward
            writer.add_scalar('data/episode_len', episode_lens[sub_i], i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i], i_episode + sub_i)
        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(policy.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break
    # Save final model
    torch.save(policy.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()
from metrics import MetricLogger
from agent import Mario
from wrappers import ResizeObservation, SkipFrame

# Initialize Super Mario environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [['right'], ['right', 'A']])

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)
env.reset()

save_dir = Path('checkpoints') / datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)

checkpoint = None  # Path('checkpoints/2020-10-21T18-25-27/mario.chkpt')

# Add in check to see if GPU is available (BM)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Using GPU!")
def train_agent(layout: str, episodes: int = 10000, frames_to_skip: int = 4):
    GAMMA = 0.99
    EPSILON = 1.0
    EPS_END = 0.1
    EPS_DECAY = 1e7
    TARGET_UPDATE = 10
    BATCH_SIZE = 64
    epsilon_by_frame = lambda frame_idx: EPS_END + (EPSILON - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)

    # Build the wrapped environment and grab an initial observation so that the
    # network layers can be sized from its shape (a stack of four 84x84 grayscale frames).
    env = PacmanEnv(layout=layout)
    env = SkipFrame(env, skip=frames_to_skip)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen.shape, n_actions).to(device)
    target_net = DQN(screen.shape, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayBuffer(BATCH_SIZE)

    for i_episode in range(episodes):
        # Initialize the environment and state
        state = env.reset(mode='rgb_array')
        ep_reward = 0.
        # Anneal the exploration rate per episode
        EPSILON = epsilon_by_frame(i_episode)
        for t in count():
            # Select and perform an action
            env.render(mode='human')
            action = select_action(state, EPSILON, policy_net, n_actions)
            next_state, reward, done, info = env.step(action)
            # Clip the reward to [-1, 1]
            reward = max(-1.0, min(reward, 1.0))
            ep_reward += reward
            memory.cache(state, next_state, action, reward, done)
            # Observe new state
            if done:
                next_state = None
            # Move to the next state
            state = next_state
            # Perform one step of the optimization on the policy network
            optimize_model(memory, policy_net, optimizer, target_net, GAMMA)
            if done:
                print("Episode #{}, lasts for {} timestep, total reward: {}".format(i_episode, t + 1, ep_reward))
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if i_episode % 1000 == 0:
            save_model(target_net, 'pacman.pth')

    print('Complete')
    env.render()
    env.close()
    save_model(target_net, 'pacman.pth')
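# Usage sketch (assumption): train first, then replay the greedy policy with run_agent
# from the snippet above, which loads the same 'pacman.pth' checkpoint. The layout name
# 'mediumClassic' is hypothetical; pass whatever layout string PacmanEnv accepts here.
if __name__ == '__main__':
    train_agent(layout='mediumClassic', episodes=10000, frames_to_skip=4)
    run_agent(layout='mediumClassic')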
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    # Logging
    parser.add_argument('--alias', type=str, default='base',
                        help="""Alias of the model.""")
    parser.add_argument('--render_interval', type=int, default=100,
                        help='interval between rendered epochs (default: 100)')
    # Learning parameters
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--eb', type=int, default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy', type=str, default=None,
                        help="""Policy checkpoint to restore.""")
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--baseline', action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    args = parser.parse_args()
    # Check cuda
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = '%s_%s' % (args.alias, time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')
    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)
    # Create VAE policy
    vape = VAEPolicy()
    optimizer = optim.Adam(vape.parameters(), lr=1e-04)
    # Animation of environment
    obs = env.reset()
    obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
    rebuild = vape.encode_decode(obs_torch)
    rebuild = NHWC(rebuild.detach().numpy()[0])
    fig1 = plt.figure()
    if len(obs.shape) == 3 and (obs.shape[-1] == 1):
        im = plt.imshow(side_by_side(obs, rebuild), cmap="Greys")
    else:
        im = plt.imshow(side_by_side(obs, rebuild))
    done = False
    HORIZON = 200
    timestep = 0

    # Setting animation update function
    def updatefig(*args):
        nonlocal done
        nonlocal obs
        nonlocal HORIZON
        nonlocal timestep
        obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
        if not done and timestep < HORIZON:
            action, action_proba = vape.act(obs_torch)
            action = action[0].detach().numpy()
            obs, reward, done, info = env.step(action)
            env.render(mode='human')
            timestep += 1
        else:
            done = False
            obs = env.reset()
            timestep = 0
        rebuild = vape.encode_decode(obs_torch)
        rebuild = NHWC(rebuild.detach().numpy()[0])
        im.set_array(side_by_side(obs, rebuild))
        vape.optimize_vae(obs_torch, optimizer)
        time.sleep(0.01)
        return im,

    # Start animation
    ani = animation.FuncAnimation(fig1, updatefig, interval=50, blit=True)
    plt.show()
    # Close env and writer
    env.close()
    writer.close()
# Check cuda
args.cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if args.cuda else "cpu")

# Loading the dataset
if args.dataset:
    dataset = np.array(pickle.load(open(args.dataset, 'rb')))
    N_SAMPLES, W, H, CHANNELS = dataset.shape
    print("Dataset size:", N_SAMPLES)
    print("Channels:", CHANNELS)
    print("Image dim: (%d,%d)" % (W, H))
    dataset_torch = torch.from_numpy(NCHW(dataset)).float().to(device)
else:
    print("Using gym environment directly.")
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = NormalizeRGB(env)
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env.seed(args.seed)

# Network creation
VAE_class = VAEbyArch(args.arch)
vae = VAE_class(latent_size=args.latent_size).to(device)

# Restore checkpoint
assert args.vae, "No checkpoint provided."
vae.load_state_dict(torch.load(args.vae))
vae.eval()

if args.dataset:
    # Single observation display
    mu, log_sigma, z, rebuild = vae(dataset_torch[args.sample:args.sample + 1])
import gym
import matplotlib.pyplot as plt
from tqdm import trange
from wrappers import ResizeObservation, CropCarRacing, Scolorized, NormalizeRGB

'''
Car Racing action space: Box(3) floats
    action[0]: steer, -1 to 1
    action[1]: gas, 0 to 1
    action[2]: brake, 0 to 1
'''

env = gym.make('CarRacing-v0')
env = CropCarRacing(env)
env = ResizeObservation(env, (64, 64, 3))
# env = Scolorized(env)
env = NormalizeRGB(env)
dataset = []
env.seed(42)
obs = env.reset()
done = False
print(env.observation_space)
print(env.action_space)
for i in trange(50):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    env.render()
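# Sketch (assumption): the snippet above declares `dataset = []` but never appends to it
# or writes it out. A minimal collection pass that produces the pickled (N, W, H, C) array
# expected by the VAE loading code in this collection could look like this; the filename
# 'car_racing_dataset.pkl' is hypothetical.
import pickle

for i in trange(50):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    dataset.append(obs)  # each obs is a normalized (64, 64, 3) frame
    if done:
        obs = env.reset()

with open('car_racing_dataset.pkl', 'wb') as f:
    pickle.dump(dataset, f)
env.close()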
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--vae_lr', type=float, default=1e-04,
                        help='VAE learning rate (default: 1e-04)')
    parser.add_argument('--eb', type=int, default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--controller', type=str, default=None,
                        help="""Controller checkpoint to restore.""")
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--render', action='store_true',
                        help='render the environment')
    parser.add_argument('--baseline', action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval', type=int, default=100,
                        help='interval between rendered epochs (default: 100)')
    parser.add_argument('--avoidance', type=str, default='self',
                        help='Avoidance scheme')
    parser.add_argument('--dist', type=str, default='beta',
                        help='Action probability distribution.')
    parser.add_argument('--avoidance_max', type=float, default=1.0,
                        help='Avoidance max value')
    args = parser.parse_args()
    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    # env = ActionScaler(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'weights/' + alias + '_final.torch'
    if not os.path.exists('weights/'):
        os.makedirs('weights/')
    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)
    # Declare VAE policy
    vape = VAEPolicy(avoidance=args.avoidance, avoidance_threshold=args.avoidance_max, vae_lr=args.vae_lr)
    if args.controller:
        vape.load_state_dict(torch.load(args.controller))
    # Declare sampler
    sampler = Sampler(env, args.horizon)
    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0, args.episodes, args.eb, desc="Episodes", unit_scale=args.eb):
        # Sample trajectories
        trajectories, losses_and_info = sampler.sample(args.eb, vape, render=False)  # (i_episode % args.render_interval == 0)
        reco_loss, norm_loss, total_loss, added_to_batch, avoidance_score = zip(*losses_and_info)
        # Update policy
        finish_episode(trajectories, vape, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward over the last 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward', np.mean(running_reward), i_episode + sub_i)
            # Summaries: episode length and reward
            writer.add_scalar('data/episode_len', episode_lens[sub_i], i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i], i_episode + sub_i)
        # Summaries: per-batch VAE losses and avoidance statistics
        writer.add_scalar('data/added_to_batch', np.sum(added_to_batch), i_episode / args.eb)
        writer.add_scalar('data/mean_avoidance', np.mean(avoidance_score), i_episode / args.eb)
        writer.add_scalar('data/reco_loss', np.mean(reco_loss), i_episode / args.eb)
        writer.add_scalar('data/norm_loss', np.mean(norm_loss), i_episode / args.eb)
        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(vape.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break
    # Save final model
    torch.save(vape.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()