def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
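

# Usage sketch for the Acrobot experiment above: a minimal entry point, assuming
# the script is run standalone. The epoch/step counts below are illustrative
# values for a quick run, not taken from the original script.
if __name__ == '__main__':
    experiment(n_epochs=10, n_steps=1000, n_steps_test=2000)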


def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer. Only used '
                              'in rmspropcentered.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard DQN, '
                              'ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble "
                              "for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of learning steps before each update of '
                              'the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of learning steps before each evaluation. '
                              'This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of learning steps before each fit of the '
                              'neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of steps until the exploration rate stops '
                              'decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at the '
                              'beginning of the episodes. The minimum number is '
                              'history_length. This number is 30 in the DQN '
                              'DeepMind paper, but they consider the first 30 '
                              'frames without frame skipping.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')
    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            name='test',
            load_path=args.load_path,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:  # DQN learning run
        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name + \
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
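

# Entry-point sketch (assumption): the script above takes every hyperparameter
# from argparse, so launching it is just a call to experiment(). In a learning
# run, the returned per-epoch statistics are also written to scores.npy inside
# the log folder as the run progresses.
if __name__ == '__main__':
    scores = experiment()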


        optimizer=optimizer,
        loss=F.smooth_l1_loss)

    approximator = TorchApproximator

    # Agent
    algorithm_params = dict(
        batch_size=32,
        target_update_frequency=target_update_frequency // train_frequency,
        replay_memory=None,
        initial_replay_size=initial_replay_size,
        max_replay_size=max_replay_size)
    agent = DQN(approximator, pi, mdp.info,
                approximator_params=approximator_params,
                **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # RUN

    # Fill replay memory with random dataset
    print_epoch(0)
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # Evaluate initial policy
    pi.set_epsilon(epsilon_test)
    mdp.set_episode_end(False)


def experiment(alg):
    gym.logger.setLevel(0)
    np.random.seed(88)
    tf.set_random_seed(88)

    # DQN settings
    initial_replay_size = 500
    max_replay_size = 1000
    train_frequency = 50
    target_update_frequency = 100
    evaluation_frequency = 200
    max_steps = 2000

    # MDP train
    mdp = Atari('BreakoutDeterministic-v4', 84, 84, ends_at_life=True)

    # Policy
    epsilon = LinearDecayParameter(value=1, min_value=.1, n=10)
    epsilon_test = Parameter(value=.05)
    epsilon_random = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = (84, 84, 4)
    approximator_params = dict(
        input_shape=input_shape,
        output_shape=(mdp.info.action_space.n,),
        n_actions=mdp.info.action_space.n,
        input_preprocessor=[Scaler(mdp.info.observation_space.high[0, 0])],
        optimizer={'name': 'rmsprop',
                   'lr': .00025,
                   'decay': .95,
                   'epsilon': 1e-10})

    approximator = ConvNet

    # Agent
    algorithm_params = dict(
        batch_size=32,
        initial_replay_size=initial_replay_size,
        n_approximators=2 if alg == 'adqn' else 1,
        max_replay_size=max_replay_size,
        history_length=4,
        train_frequency=train_frequency,
        target_update_frequency=target_update_frequency,
        max_no_op_actions=10,
        no_op_action_value=0)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}

    if alg == 'dqn':
        agent = DQN(approximator, pi, mdp.info, agent_params)
    elif alg == 'ddqn':
        agent = DoubleDQN(approximator, pi, mdp.info, agent_params)
    elif alg == 'adqn':
        agent = AveragedDQN(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # DQN

    # fill replay memory with random dataset
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size, quiet=True)

    # evaluate initial policy
    pi.set_epsilon(epsilon_test)
    mdp.set_episode_end(ends_at_life=False)

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # learning step
        pi.set_epsilon(epsilon)
        mdp.set_episode_end(ends_at_life=True)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=train_frequency, quiet=True)

        # evaluation step
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(ends_at_life=False)

    w = agent.approximator.model.get_weights(only_trainable=True)

    return w
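

# Usage sketch: running the short experiment above once per algorithm variant
# and collecting the trainable weights it returns. The loop, the dict and the
# length check are illustrative assumptions, not part of the original script.
if __name__ == '__main__':
    weights = {alg: experiment(alg) for alg in ('dqn', 'ddqn', 'adqn')}
    for alg, w in weights.items():
        print(alg, len(w))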


def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')
    arg_mem.add_argument("--prioritized", action='store_true',
                         help='Whether to use prioritized memory or not.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='rmsprop',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the '
                              'gradient momentum in rmspropcentered and '
                              'rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=1e-8,
                         help='Epsilon term used in rmspropcentered and '
                              'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm",
                         choices=['dqn', 'ddqn', 'adqn', 'cdqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard DQN, '
                              'ddqn is for Double DQN, adqn is for Averaged '
                              'DQN and cdqn is for Categorical DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble "
                              "for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update '
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number of '
                              'steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of '
                              'the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of collected samples until the '
                              'exploration rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each '
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=30,
                         help='Maximum number of no-op actions performed at the '
                              'beginning of the episodes.')
    arg_alg.add_argument("--n-atoms", type=int, default=51,
                         help='Number of atoms for Categorical DQN.')
    arg_alg.add_argument("--v-min", type=int, default=-10,
                         help='Minimum action-value for Categorical DQN.')
    arg_alg.add_argument("--v-max", type=int, default=10,
                         help='Maximum action-value for Categorical DQN.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--use-cuda', action='store_true',
                           help='Flag specifying whether to use the GPU.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')
    args = parser.parse_args()

    scores = list()

    # Optimizer
    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate, eps=args.epsilon)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate, alpha=args.decay,
                                   eps=args.epsilon, centered=True)
    else:
        raise ValueError

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False, history_length=args.history_length,
                    max_no_op_actions=args.max_no_op_actions)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.history_length, args.screen_height,
                       args.screen_width)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            load_path=args.load_path,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            use_cuda=args.use_cuda
        )

        approximator = TorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:  # DQN learning run
        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name + \
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        pathlib.Path(folder_name).mkdir(parents=True)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True, history_length=args.history_length,
                    max_no_op_actions=args.max_no_op_actions)

        # Policy
        epsilon = LinearParameter(value=args.initial_exploration_rate,
                                  threshold_value=args.final_exploration_rate,
                                  n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Cross-entropy loss on the predicted return distribution, used by
        # Categorical DQN.
        class CategoricalLoss(nn.Module):
            def forward(self, input, target):
                input = input.clamp(1e-5)

                return -torch.sum(target * torch.log(input))

        # Approximator
        input_shape = (args.history_length, args.screen_height,
                       args.screen_width)
        approximator_params = dict(
            network=Network if args.algorithm != 'cdqn' else FeatureNetwork,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            n_features=Network.n_features,
            optimizer=optimizer,
            loss=F.smooth_l1_loss if args.algorithm != 'cdqn'
                else CategoricalLoss(),
            use_cuda=args.use_cuda
        )

        approximator = TorchApproximator

        # Replay memory
        if args.prioritized:
            replay_memory = PrioritizedReplayMemory(
                initial_replay_size, max_replay_size, alpha=.6,
                beta=LinearParameter(.4, threshold_value=1,
                                     n=max_steps // train_frequency)
            )
        else:
            replay_memory = None

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            target_update_frequency=target_update_frequency // train_frequency,
            replay_memory=replay_memory,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)
        elif args.algorithm == 'cdqn':
            agent = CategoricalDQN(pi, mdp.info,
                                   approximator_params=approximator_params,
                                   n_atoms=args.n_atoms, v_min=args.v_min,
                                   v_max=args.v_max, **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            np.save(folder_name + '/weights-exp-0-0.npy',
                    agent.approximator.get_weights())

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                np.save(folder_name + '/weights-exp-0-' + str(n_epoch) + '.npy',
                        agent.approximator.get_weights())

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores
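

# Entry-point and post-processing sketch (assumptions: the __main__ guard and
# the inspection of the returned scores are illustrative additions; the exact
# layout of each get_stats() entry is not specified here, so only the number of
# evaluated epochs is printed).
if __name__ == '__main__':
    scores = experiment()
    print('Epochs evaluated:', len(scores))

    # In a learning run the same statistics are also saved after every epoch
    # and can be reloaded later, e.g. for plotting:
    #   scores = np.load('<folder_name>/scores.npy', allow_pickle=True)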