def __init__(self, cfg, tetris):
    self.num_actions = cfg.MODEL.SIZE_ACTION
    self.gamma = cfg.SOLVER.GAMMA
    self.BATCH_SIZE = cfg.SOLVER.BATCH_SIZE
    # named tuple describing one replay-buffer transition
    transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
    self.memory = ReplayMemory(cfg.SOLVER.CAPACITY, transition)
    # policy network plus a copy that serves as the target network
    self.model = get_model(cfg)
    self.target_net = copy.deepcopy(self.model)
    self.target_net.load_state_dict(self.model.state_dict())
    self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
    self.tetris = tetris
def __init__(
    self,
    network: nn.Module,
    actions: int,
    logger: Optional = None,
    learning_rate: float = 0.00025,
    replay_start_size: int = 50000,
    replay_size: int = 1000000,
    batch_size: int = 32,
    sync_target_step: int = 10000,
    update_frequency: int = 4,
    gradient_clipping: bool = False,
    reward_clipping: bool = True,
    gamma: float = 0.99,
    epsilon_start: float = 1.0,
    epsilon_end: float = 0.1,
    epsilon_end_step: int = 1000000,
    epsilon_testing: float = 0.05,
    training: bool = True,
    device: str = 'gpu',
    seed: Optional[int] = None
):
    """
    Initializes a DQN agent.

    Args:
        network: a neural network to learn the Q-function
        actions: number of actions the agent can take
        logger: a logger with a write method that receives scalars and a timestep
        learning_rate: the learning rate for the optimizer
        replay_start_size: minimum number of samples in memory before optimization starts;
            also the number of time steps taken before epsilon starts to decay
        replay_size: maximum size of the replay buffer
        batch_size: number of samples for each parameter update
        sync_target_step: number of policy updates between target network updates
        update_frequency: number of time steps between learning steps
        gradient_clipping: if True, the gradients are clipped between -1 and 1
        reward_clipping: if True, the rewards are clipped between -1 and 1
        gamma: the discount factor of the MDP
        epsilon_start: value of epsilon at the start of training
        epsilon_end: value of epsilon at the end of training
        epsilon_end_step: number of time steps over which epsilon is linearly decayed
        epsilon_testing: value of epsilon during testing
        training: if True the agent is training, if False it is testing
        device: device to be used in PyTorch, either 'gpu' or 'cpu'
        seed: the random seed
    """
    if seed is not None:
        torch.random.manual_seed(seed)

    # selecting the device to use
    self._device = torch.device("cuda" if torch.cuda.is_available() and device == 'gpu' else "cpu")
    print(f"Using {self._device}...")

    # creating the target network; eval() has no effect here since the network uses no dropout
    self._policy_network = network.to(self._device)
    self._target_network = deepcopy(self._policy_network).to(self._device)
    self._target_network.eval()

    # saving the logger
    if logger is not None:
        self._logger = logger

    # initializing the optimizer and saving some optimization-related parameters
    self._learning_rate = learning_rate
    # NOTE: the Adam hyper-parameters below are hardcoded and override the learning_rate argument
    # self._optimizer = RMSprop(self._policy_network.parameters(), self._learning_rate)
    self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000625, eps=0.00015)
    # self._optimizer = torch.optim.Adam(self._policy_network.parameters(), lr=0.0000125, eps=0.00015)
    self._batch_size = batch_size
    self._sync_target_step = sync_target_step
    self._update_frequency = update_frequency
    self._gradient_clipping = gradient_clipping
    self._loss_fn = torch.nn.L1Loss(reduction="none")
    self._reward_clipping = reward_clipping

    # setting the action space
    self._actions = actions
    self._num_steps = 0

    # setting the replay buffer
    self._replay_start_size = replay_start_size
    self._replay_size = replay_size
    self._memory = ReplayMemory(size=replay_size, seed=seed)

    # setting the MDP parameters
    self._gamma = gamma

    # setting the exploration parameters
    self._epsilon_end = epsilon_end
    self._epsilon_diff = epsilon_start - epsilon_end
    self._epsilon_end_step = epsilon_end_step
    self._epsilon_testing = epsilon_testing
    self._epsilon = epsilon_start

    # setting the training status
    self._training = training
    self._timestep = None
    self._next_timestep = None
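# Illustrative sketch, not part of the class above: the linear epsilon schedule
# described in the docstring decays epsilon from epsilon_start to epsilon_end over
# epsilon_end_step steps and, per the docstring, only starts decaying after
# replay_start_size warm-up steps. The standalone function form and its name are
# assumptions for illustration only.
def linear_epsilon(step, epsilon_start=1.0, epsilon_end=0.1,
                   epsilon_end_step=1000000, replay_start_size=50000):
    decay_steps = max(step - replay_start_size, 0)
    fraction = min(decay_steps / epsilon_end_step, 1.0)
    return epsilon_start - fraction * (epsilon_start - epsilon_end)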
import gym
import numpy as np
import tensorflow as tf
# DQNAgent, NN and ReplayMemory are provided by the surrounding project


# helper method for reshaping the cartpole observation
def reshape(state):
    return np.reshape(state, [1, 4])


if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()

    max_score = 0
    n_episodes = 5000
    max_env_steps = 1000

    env = gym.make('CartPole-v0')
    agent = DQNAgent(env=env,
                     net=NN(alpha=0.001, decay=0.0001),
                     memory=ReplayMemory(size=100000))

    if max_env_steps is not None:
        env._max_episode_steps = max_env_steps

    for e in range(n_episodes):
        # reset the env
        state = reshape(env.reset())
        done = False
        score = 0

        # play until env done
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            next_state = reshape(next_state)
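# Hedged sketch, not from the script above: one common way the inner loop is
# completed in CartPole DQN examples -- store the transition, update the running
# score, and learn from the replay buffer once the episode ends. The remember()
# and replay() method names are assumptions, not the actual agent API.
def run_episode(env, agent, batch_size=64):
    state = reshape(env.reset())
    done, score = False, 0
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = reshape(next_state)
        agent.remember(state, action, reward, next_state, done)  # assumed API
        state = next_state
        score += reward
    agent.replay(batch_size)  # assumed API
    return score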
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--input_shape', nargs=2, type=int, default=None,
                        help='Input shape')
    parser.add_argument('--num_frame', default=4, type=int,
                        help='Number of frames in a state')
    parser.add_argument('--discount', default=0.99, type=float,
                        help='Discount factor gamma')
    parser.add_argument('--online_train_interval', default=4, type=int,
                        help='Interval to train the online network')
    parser.add_argument('--target_reset_interval', default=10000, type=int,
                        help='Interval to reset the target network')
    parser.add_argument('--action_change_interval', default=1, type=int,
                        help='Interval to change action')
    parser.add_argument('--print_loss_interval', default=100, type=int,
                        help='Interval to print losses')
    parser.add_argument('--replay_buffer_size', default=100000, type=int,
                        help='Replay buffer size')
    parser.add_argument('--num_burn_in', default=25000, type=int,
                        help='Number of samples filled in memory before update')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='How many samples in each minibatch')
    parser.add_argument('--learning_rate', default=1e-4, type=float,
                        help='Learning rate alpha')
    parser.add_argument('--explore_prob', default=0.05, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--decay_prob_start', default=1.0, type=float,
                        help='Starting probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_prob_end', default=0.1, type=float,
                        help='Ending probability in linear-decay epsilon-greedy')
    parser.add_argument('--decay_steps', default=1000000, type=int,
                        help='Decay steps in linear-decay epsilon-greedy')
    parser.add_argument('--num_train', default=5000000, type=int,
                        help='Number of training sampled interactions with the environment')
    parser.add_argument('--max_episode_length', default=999999, type=int,
                        help='Maximum length of an episode')
    parser.add_argument('--save_interval', default=100000, type=int,
                        help='Interval to save weights and memory')
    parser.add_argument('--model_name', default='dqn', type=str, help='Model name')
    parser.add_argument('--eval_interval', default=10000, type=int,
                        help='Evaluation interval')
    parser.add_argument('--eval_episodes', default=20, type=int,
                        help='Number of episodes in evaluation')
    # store_true avoids the argparse pitfall where type=bool treats any
    # non-empty string (including "False") as True
    parser.add_argument('--double_q', action='store_true',
                        help='Invoke double Q net')
    parser.add_argument('--do_render', action='store_true',
                        help='Do rendering or not')
    parser.add_argument('--read_weights', default=None, type=str,
                        help='Read weights from file')
    parser.add_argument('--read_memory', default=None, type=str,
                        help='Read memory from file')
    args = parser.parse_args()
    print('########## All arguments ##########:', args)

    # tuple(None) would raise a TypeError, so only convert when a shape was given
    if args.input_shape is not None:
        args.input_shape = tuple(args.input_shape)
    args.output = get_output_folder(args.output, args.env)

    env = gym.make(args.env)
    num_actions = env.action_space.n
    opt_adam = Adam(lr=args.learning_rate)

    model_online = create_model(args.num_frame, args.input_shape, num_actions,
                                model_name=args.model_name)
    model_target = create_model(args.num_frame, args.input_shape, num_actions,
                                model_name=args.model_name)
    q_network = {'online': model_online, 'target': model_target}

    preproc = AtariPreprocessor(args.input_shape)
    memory = ReplayMemory(args.replay_buffer_size, args.num_frame)
    policy_random = UniformRandomPolicy(num_actions)
    policy_train = LinearDecayGreedyEpsilonPolicy(args.decay_prob_start,
                                                  args.decay_prob_end,
                                                  args.decay_steps)
    policy_eval = GreedyEpsilonPolicy(args.explore_prob)
    policy = {'random': policy_random, 'train': policy_train, 'eval': policy_eval}

    agent = DQNAgent(num_actions, q_network, preproc, memory, policy, args)
    agent.compile([mean_huber_loss, null_loss], opt_adam)

    if args.read_weights is not None:
        agent.q_network['online'].load_weights(args.read_weights)
    if args.read_memory is not None:
        with open(args.read_memory, 'rb') as save_memory:
            agent.memory = pickle.load(save_memory)

    print('########## training #############')
    agent.fit(env)
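# Hedged sketch, not the project's objectives module: a reference implementation
# of the mean Huber loss used when compiling the agent above. The Huber loss is
# quadratic for errors up to delta and linear beyond it, which keeps gradients
# from large TD errors bounded; numpy is used here purely for illustration.
import numpy as np

def mean_huber_loss_reference(y_true, y_pred, delta=1.0):
    error = np.abs(y_true - y_pred)
    quadratic = np.minimum(error, delta)
    linear = error - quadratic
    return np.mean(0.5 * quadratic ** 2 + delta * linear)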
import gym

from dqn.bots import AtariBot
from dqn.policy import DDQNPolicy
from dqn.memory import ReplayMemory

GAME = 'Breakout-v0'

# TODO: List params to tune here, eventually migrate this to a readme

if __name__ == "__main__":
    policy = DDQNPolicy()
    memory = ReplayMemory()

    game = gym.make(GAME)
    game.ale.setInt(b'frame_skip', 4)

    robot = AtariBot(policy=policy, memory=memory)
    robot.train(game=game, ckpt_dir="models")
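# Hedged sketch, not the dqn.policy implementation: the double-DQN idea that a
# policy like DDQNPolicy typically relies on -- the online network selects the
# next action and the target network evaluates it, which reduces overestimation
# bias compared with vanilla DQN. Array-based and for illustration only.
import numpy as np

def double_dqn_targets(q_online_next, q_target_next, rewards, dones, gamma=0.99):
    # q_online_next, q_target_next: arrays of shape (batch, num_actions)
    best_actions = np.argmax(q_online_next, axis=1)
    next_values = q_target_next[np.arange(len(best_actions)), best_actions]
    # no bootstrapping on terminal transitions
    return rewards + gamma * (1.0 - dones) * next_values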