def breakout_a2c_evaluate(checkpoint_file_path, takes=10):
    # Load saved model weights (a state dict produced by torch.save)
    model_checkpoint = torch.load(checkpoint_file_path)
    device = torch.device('cuda:0')

    # Single recorded evaluation environment with a stack of 4 frames
    env = FrameStack(
        ClassicAtariEnv('BreakoutNoFrameskip-v4').instantiate(preset='record'),
        k=4
    )

    # Rebuild the same model architecture that was used during training
    model = StochasticPolicyModelFactory(
        input_block=ImageToTensorFactory(),
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    ).instantiate(action_space=env.action_space)

    model.load_state_dict(model_checkpoint)
    model = model.to(device)
    model.eval()

    rewards = []
    lengths = []

    # Record `takes` evaluation episodes and collect their returns and lengths
    for i in range(takes):
        result = record_take(model, env, device)
        rewards.append(result['r'])
        lengths.append(result['l'])

    print(pd.DataFrame({'lengths': lengths, 'rewards': rewards}).describe())
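# A minimal usage sketch for the helper above, assuming weights were previously saved
# with torch.save(model.state_dict(), ...). The checkpoint path is hypothetical, not a
# file produced by this code.
if __name__ == '__main__':
    breakout_a2c_evaluate('checkpoints/breakout_a2c.pt', takes=10)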
def breakout_a2c():
    device = torch.device('cuda:0')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 16 environments evaluated in parallel in subprocesses, with all the usual
    # DeepMind wrappers. These are just helper functions for that.
    vec_env = SubprocVecEnvWrapper(
        ClassicAtariEnv('BreakoutNoFrameskip-v4'), frame_history=4
    ).instantiate(parallel_envs=16, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, it should not be accessed through
    # this variable but through the reinforcer.model property.
    model = PolicyGradientModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    ).instantiate(action_space=vec_env.action_space)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            batch_size=256,
        ),
        model=model,
        algo=A2CPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.5
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=5,
            discount_factor=0.99,
        )
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(), lr=7.0e-4, eps=1e-3)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[StdoutStreaming()]  # Print live metrics every epoch to standard output
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 100 batches per epoch to average metrics nicely
    num_epochs = int(1.1e7 / (5 * 16) / 100)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=100,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
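# Sketch only, not part of the original training script: breakout_a2c_evaluate() above
# loads a state dict with torch.load, so a natural companion is a small save step at the
# end of breakout_a2c(), after training_info.on_train_end(), where `reinforcer` is still
# in scope. The output path is a hypothetical example.
def save_checkpoint(reinforcer, path='checkpoints/breakout_a2c.pt'):
    """Persist model weights in the format breakout_a2c_evaluate() expects."""
    torch.save(reinforcer.model.state_dict(), path)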
def test_acer_breakout():
    """ One iteration of ACER on the Breakout environment """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 16 environments evaluated in parallel in subprocesses, with all the usual
    # DeepMind wrappers. These are just helper functions for that.
    vec_env = SubprocVecEnvWrapper(
        ClassicAtariEnv('BreakoutNoFrameskip-v4'), frame_history=4
    ).instantiate(parallel_envs=16, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, it should not be accessed through
    # this variable but through the reinforcer.model property.
    model_factory = QPolicyGradientModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    )

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedMixedPolicyIterationReinforcer(
        device=device,
        settings=BufferedMixedPolicyIterationReinforcerSettings(
            discount_factor=0.99,
            experience_replay=2,
            stochastic_experience_replay=False
        ),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        env=vec_env,
        algo=AcerPolicyGradient(
            model_factory=model_factory,
            entropy_coefficient=0.01,
            q_coefficient=0.5,
            rho_cap=10.0,
            retrace_rho_cap=1.0,
            trust_region=True,
            trust_region_delta=1.0,
            max_grad_norm=10.0,
        ),
        env_roller=ReplayQEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=12,
            discount_factor=0.99,
            buffer_capacity=100,
            buffer_initial_size=100,
            frame_stack_compensation=4
        ),
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(), lr=7.0e-4, eps=1e-3, alpha=0.99)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[]  # No callbacks needed for this smoke test
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
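# A hedged sketch of running just this smoke test programmatically; it assumes pytest is
# installed and that this function lives in a file pytest would normally collect.
if __name__ == '__main__':
    import pytest
    pytest.main(['-k', 'test_acer_breakout', '-v'])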
def test_prioritized_dqn_breakout():
    """ One iteration of DQN with prioritized replay on Breakout """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Only a single environment for DQN
    env = ClassicAtariEnv('BreakoutNoFrameskip-v4').instantiate(seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, it should not be accessed through
    # this variable but through the reinforcer.model property.
    model_factory = QModelFactory(
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    )

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99
        ),
        environment=env,
        algo=DeepQLearning(
            model_factory=model_factory,
            double_dqn=False,
            target_update_frequency=10_000,
            max_grad_norm=0.5
        ),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=PrioritizedReplayRollerEpsGreedy(
            environment=env,
            device=device,
            epsilon_schedule=LinearAndConstantSchedule(
                initial_value=1.0, final_value=0.1, end_of_interpolation=0.1
            ),
            batch_size=8,
            buffer_capacity=100,
            priority_epsilon=1.0e-6,
            buffer_initial_size=100,
            frame_stack=4,
            priority_exponent=0.6,
            priority_weight=LinearSchedule(initial_value=0.4, final_value=1.0),
        ),
    )

    # Model optimizer
    optimizer = optim.RMSprop(reinforcer.model.parameters(),
                              lr=2.5e-4, alpha=0.95, momentum=0.95, eps=1e-3)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track frames so progress-based schedules can advance
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for a smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
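# Illustrative sketch, not library code: one reading of the epsilon-greedy schedule above.
# Assuming end_of_interpolation is a fraction of total training progress (the frame budget
# tracked by FrameTracker), epsilon falls linearly from initial_value=1.0 to final_value=0.1
# over the first 10% of training and stays constant afterwards.
def epsilon_at(progress, initial=1.0, final=0.1, end_of_interpolation=0.1):
    if progress >= end_of_interpolation:
        return final
    fraction = progress / end_of_interpolation
    return initial + fraction * (final - initial)


assert abs(epsilon_at(0.0) - 1.0) < 1e-9    # fully random at the start
assert abs(epsilon_at(0.05) - 0.55) < 1e-9  # halfway through the interpolation window
assert abs(epsilon_at(0.5) - 0.1) < 1e-9    # constant after the window ends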
# Ad-hoc sanity check: compare frames from the wrapped Atari env (`env`) against a
# hand-rolled cv2 resize + grayscale pipeline on the raw env (`env2`), both defined above.
for i in range(10):
    a = 0  # env.action_space.sample()

    for _ in range(4):
        (obnew, rr, done, _info) = env.step(a)
        (obnew2, rr2, done2, _info2) = env2.step(a)

        # Reproduce the wrapper's preprocessing manually: resize to 84x84, then grayscale
        obnew2 = cv2.cvtColor(
            cv2.resize(obnew2, (84, 84), interpolation=cv2.INTER_LINEAR),
            cv2.COLOR_BGR2GRAY
        )

        print(obnew[:, :, 0].shape, rr, done, _info)
        print(obnew2.shape, rr2, done2, _info2)
        # Mean absolute pixel difference between the two preprocessing paths
        print(abs(obnew[:, :, 0] - obnew2).mean())

    print(obnew.repeat(3, 2).shape)

    # Show both frames stacked for visual comparison (replicated to 3 channels for display)
    plt.imshow(np.concatenate((obnew.repeat(3, 2), obnew2[..., None].repeat(3, 2))))
    plt.show()

# Stop here; the vectorized-environment check below is currently unreachable
exit()

# Quick check of the vectorized environment API: 8 parallel Breakout envs, 4 stacked frames
env = SubprocVecEnvWrapper(ClassicAtariEnv('BreakoutNoFrameskip-v4'),
                           frame_history=4).instantiate(parallel_envs=8, seed=1)

print(env.observation_space, env.action_space, 'max_episode_steps', env.venv.specs[0])
print(env)

ob = env.reset()
print('ob', ob.shape)

# One random action per parallel environment
a = np.array([env.action_space.sample() for _ in range(8)])
print(a)

(obnew, rr, done, _info) = env.step(a)
print(obnew.shape, rr.shape, done.shape, _info)
def qbert_ppo():
    device = torch.device('cuda:0')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Create 8 environments evaluated in parallel in subprocesses, with all the usual
    # DeepMind wrappers. These are just helper functions for that.
    vec_env = SubprocVecEnvWrapper(
        ClassicAtariEnv('QbertNoFrameskip-v4'), frame_history=4
    ).instantiate(parallel_envs=8, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, it should not be accessed through
    # this variable but through the reinforcer.model property.
    model = StochasticPolicyModelFactory(
        input_block=ImageToTensorFactory(),
        backbone=NatureCnnFactory(input_width=84, input_height=84, input_channels=4)
    ).instantiate(action_space=vec_env.action_space)

    # Schedule for the PPO surrogate-objective clip range, annealed linearly to zero
    cliprange = LinearSchedule(initial_value=0.1, final_value=0.0)

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            batch_size=256,
            experience_replay=4,
            number_of_steps=128
        ),
        model=model,
        algo=PpoPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.5,
            discount_factor=0.99,
            gae_lambda=0.95,
            cliprange=cliprange
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1.0e-5)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[
            StdoutStreaming(),   # Print live metrics every epoch to standard output
            FrameTracker(1.1e7)  # We need the frame tracker to track the progress of learning
        ]
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # Let's make 10 batches per epoch to average metrics nicely
    # Rollout size is 8 environments times 128 steps
    num_epochs = int(1.1e7 / (128 * 8) / 10)

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=10,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()
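# A minimal sketch of launching this experiment directly; it assumes a CUDA-capable GPU,
# since the function pins everything to cuda:0.
if __name__ == '__main__':
    qbert_ppo()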