def __init__(self, input_shape, nb_actions, gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), learning_rate=0.00025, momentum=0.95, minibatch_size=32, memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000, monitor=True):
    """Build a DQN agent around a small fully-connected Q-network (CNTK).

    Constructs the online action-value network, a frozen target network,
    the Huber-loss training criterion as a CNTK Function graph, and an
    Adam-based Trainer.

    Args:
        input_shape: Shape of the (stacked) state input; also used as the
            layer width of the two hidden Dense layers below.
        nb_actions: Number of discrete actions (output units).
        gamma: Discount factor applied to the target network's max Q-value.
        explorer: Epsilon-greedy annealing schedule.
            NOTE(review): this default is evaluated once at class-definition
            time, so all instances created with the default share one
            explorer object — confirm that is intended.
        learning_rate: Per-minibatch learning rate for Adam.
        momentum: First-moment (momentum) coefficient for Adam.
        minibatch_size: Samples drawn from replay memory per training step.
        memory_size: Replay-memory capacity.
        train_after: Actions to take before training starts.
        train_interval: Actions between successive training steps.
        target_update_interval: Actions between target-network refreshes.
        monitor: If True, attach a TensorBoardProgressWriter under 'metrics'.
    """
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    # NOTE(review): other variants in this file use `ReplayMemory`; confirm
    # `RepMem` is the intended class name here.
    self._memory = RepMem(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0

    # Per-episode metric accumulators (rewards, Q-value means/stddevs).
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    # Online action-value network: two hidden Dense layers sized by
    # input_shape, linear output head over the actions.
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Dense(input_shape, init=he_uniform(scale=0.01)),
            Dense(input_shape),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Frozen copy used to compute training targets; refreshed elsewhere
    # every `target_update_interval` actions.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Q-learning targets as part of the graph: for terminal transitions the
    # target is just the reward, otherwise reward + gamma * max_a' Q_target.
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals, rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Training criterion: Huber loss between the target and the Q-value of
    # the action actually taken (selected via the one-hot `actions` input).
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        q_targets = compute_q_targets(post_states, rewards, terminals)
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam-based SGD learner.
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
    self._learner = l_sgd
    # The criterion serves as both model and loss; no separate eval metric.
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
def _build_model(self):
    """Return the Q-network: three conv layers followed by two dense layers.

    Hidden layers default to ReLU activation with he_uniform init; the
    output layer is linear with one unit per action.
    """
    with default_options(init=he_uniform(), activation=relu, bias=True):
        q_net = Sequential([
            Convolution((8, 8), 32, strides=(4, 4)),
            Convolution((4, 4), 64, strides=(2, 2)),
            Convolution((3, 3), 64, strides=(1, 1)),
            Dense(512, init=he_uniform(0.01)),
            Dense(self._nb_actions, activation=None, init=he_uniform(0.01)),
        ])
    return q_net
def _build_model(self):
    """Build and return the Q-value network.

    Two named convolution layers feed a named 512-unit dense layer and a
    linear 'qvalues' head with one output per action. Defaults inside the
    options block: ReLU activation, he_uniform init, biases enabled.
    """
    with default_options(init=he_uniform(), activation=relu, bias=True):
        layers = [
            Convolution((4, 4), 64, strides=(2, 2), name='conv1'),
            Convolution((3, 3), 64, strides=(1, 1), name='conv2'),
            Dense(512, name='dense1', init=he_normal(0.01)),
            Dense(self._nb_actions, activation=None, init=he_normal(0.01), name='qvalues'),
        ]
        network = Sequential(layers)
    return network
def _build_model(self):
    """Assemble the Q-network used by this agent.

    Three convolution stages (8x8/4, 4x4/2, 3x3/1) followed by a 512-unit
    dense layer and a linear output layer sized to the action count.
    """
    with default_options(init=he_uniform(), activation=relu, bias=True):
        feature_layers = [
            Convolution((8, 8), 32, strides=(4, 4)),
            Convolution((4, 4), 64, strides=(2, 2)),
            Convolution((3, 3), 64, strides=(1, 1)),
        ]
        head_layers = [
            Dense(512, init=he_normal(0.01)),
            Dense(self._nb_actions, activation=None, init=he_normal(0.01)),
        ]
        net = Sequential(feature_layers + head_layers)
    return net
def _build_model(self):
    """Create the Q-network for this agent.

    conv1/conv2 extract features; dense1 and the linear 'qvalues' layer
    map them to one Q-value per action. ReLU + he_uniform defaults apply
    inside the options block.
    """
    with default_options(init=he_uniform(), activation=relu, bias=True):
        qvalue_model = Sequential([
            Convolution((4, 4), 64, strides=(2, 2), name='conv1'),
            Convolution((3, 3), 64, strides=(1, 1), name='conv2'),
            Dense(512, name='dense1', init=he_normal(0.01)),
            Dense(self._nb_actions, activation=None,
                  init=he_normal(0.01), name='qvalues'),
        ])
        return qvalue_model
def __init__(self, input_shape, nb_actions, gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000), learning_rate=0.01, momentum=0.8, minibatch_size=16, memory_size=15000, train_after=100, train_interval=100, target_update_interval=500, monitor=True):
    """Build a DQN agent with a small fully-connected Q-network (CNTK).

    A previous convolutional architecture is preserved (disabled) in the
    string block below. The agent keeps an online network, a frozen target
    network, a Huber-loss criterion graph, and an Adam-based Trainer.

    Args:
        input_shape: Shape of the (stacked) state input.
        nb_actions: Number of discrete actions (output units).
        gamma: Discount factor for the bootstrapped target.
        explorer: Epsilon-greedy annealing schedule.
            NOTE(review): default evaluated once at definition time, so
            instances created with the default share one explorer object.
        learning_rate: Per-minibatch learning rate for Adam.
        momentum: First-moment coefficient for Adam.
        minibatch_size: Samples per training step.
        memory_size: Replay-memory capacity.
        train_after: Actions to take before training starts.
        train_interval: Actions between training steps.
        target_update_interval: Actions between target-network refreshes.
        monitor: If True, attach a TensorBoardProgressWriter under 'metrics'.
    """
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0
    self._num_trains = 0

    # Metrics accumulator
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    # Disabled alternative architecture, kept for reference:
    '''
    # Action Value model (used by agent to interact with the environment)
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Convolution2D((8, 8), 16, strides=4),
            Convolution2D((4, 4), 32, strides=2),
            Convolution2D((3, 3), 32, strides=1),
            Dense(256, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    '''

    # Active architecture: two small hidden Dense layers (7 and 8 units)
    # with ReLU, then a linear output head over the actions.
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Dense(7, init=he_uniform(scale=0.01)),
            Dense(8, init=he_uniform(scale=0.01)),
            #Dense(16, init=he_uniform(scale=0.01)),
            #Dense(32, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Target model used to compute the target Q-values in training, updated
    # less frequently for increased stability.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Function computing Q-values targets as part of the computation graph:
    # terminal transitions use the raw reward; otherwise the bootstrapped
    # reward + gamma * max_a' Q_target.
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals, rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Define the loss, using Huber Loss (more robust to outliers)
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        # Compute the q_targets
        q_targets = compute_q_targets(post_states, rewards, terminals)

        # actions is a 1-hot encoding of the action done by the agent
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

        # Define training criterion as the Huber Loss function
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
    self._learner = l_sgd
    # The criterion serves as both model and loss; no separate eval metric.
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
def __init__(self, input_shape, nb_actions, gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), learning_rate=0.00025, momentum=0.95, minibatch_size=32, memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000, monitor=True):
    """Build a DQN agent with a convolutional Q-network (CNTK).

    Creates the online action-value network, a frozen target network, the
    Huber-loss training criterion as a CNTK Function graph, and an
    Adam-based Trainer.

    Args:
        input_shape: Shape of the (stacked) state input.
        nb_actions: Number of discrete actions (output units).
        gamma: Discount factor for the bootstrapped target.
        explorer: Epsilon-greedy annealing schedule.
            NOTE(review): default evaluated once at definition time, so
            instances created with the default share one explorer object.
        learning_rate: Per-minibatch learning rate for Adam.
        momentum: First-moment coefficient for Adam.
        minibatch_size: Samples per training step.
        memory_size: Replay-memory capacity.
        train_after: Actions to take before training starts.
        train_interval: Actions between training steps.
        target_update_interval: Actions between target-network refreshes.
        monitor: If True, attach a TensorBoardProgressWriter under 'metrics'.
    """
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0

    # Metrics accumulator
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    # Action Value model (used by agent to interact with the environment)
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Convolution2D((8, 8), 16, strides=4),
            Convolution2D((4, 4), 32, strides=2),
            Convolution2D((3, 3), 32, strides=1),
            Dense(256, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Target model used to compute the target Q-values in training, updated
    # less frequently for increased stability.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Function computing Q-values targets as part of the computation graph:
    # terminal transitions use the raw reward; otherwise the bootstrapped
    # reward + gamma * max_a' Q_target.
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals, rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Define the loss, using Huber Loss (more robust to outliers)
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        # Compute the q_targets
        q_targets = compute_q_targets(post_states, rewards, terminals)

        # actions is a 1-hot encoding of the action done by the agent
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

        # Define training criterion as the Huber Loss function
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
    self._learner = l_sgd
    # The criterion serves as both model and loss; no separate eval metric.
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
def __init__(self, state_dim, action_dim, gamma=0.99, learning_rate=1e-4, momentum=0.95):
    """Build a Double-DQN agent with importance-sampling-weighted loss (CNTK).

    Constructs the online conv net, a target copy (via self.update_target),
    explicit input variables for a transition minibatch, a DDQN target
    (online net selects the action, target net evaluates it), an elementwise
    Huber loss scaled by importance-sampling weights, and an Adam Trainer.

    Args:
        state_dim: Shape of the state input.
        action_dim: Number of discrete actions.
        gamma: Discount factor for the bootstrapped target.
        learning_rate: Per-minibatch learning rate for Adam.
        momentum: First-moment coefficient for Adam.
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.gamma = gamma

    with default_options(activation=relu, init=he_uniform()):
        # Convolution filter counts were halved to save on memory, no gpu :(
        self.model = Sequential([
            Convolution2D((8, 8), 16, strides=4, name='conv1'),
            Convolution2D((4, 4), 32, strides=2, name='conv2'),
            Convolution2D((3, 3), 32, strides=1, name='conv3'),
            Dense(256, init=he_uniform(scale=0.01), name='dense1'),
            Dense(action_dim, activation=None, init=he_uniform(scale=0.01), name='actions')
        ])
    self.model.update_signature(Tensor[state_dim])

    # Create the target model as a copy of the online model
    # (update_target is defined elsewhere in this class).
    self.target_model = None
    self.update_target()

    # Input variables for one minibatch of transitions.
    self.pre_states = input_variable(state_dim, name='pre_states')
    self.actions = input_variable(action_dim, name='actions')
    self.post_states = input_variable(state_dim, name='post_states')
    self.rewards = input_variable((), name='rewards')
    self.terminals = input_variable((), name='terminals')
    # Importance-sampling weights, presumably from prioritized replay —
    # TODO(review): confirm against the replay-memory implementation.
    self.is_weights = input_variable((), name='is_weights')

    # Q-value of the action actually taken (actions is one-hot).
    predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0)

    # DQN - calculate target q values
    # post_q = reduce_max(self.target_model(self.post_states), axis=0)

    # DDQN - calculate target q values: the online model picks the argmax
    # action, the target model supplies its value.
    online_selection = one_hot(argmax(self.model(self.post_states), axis=0), self.action_dim)
    post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0)
    # Zero out the bootstrap term for terminal transitions.
    post_q = (1.0 - self.terminals) * post_q
    # stop_gradient keeps the target fixed during backprop.
    target_q = stop_gradient(self.rewards + self.gamma * post_q)

    # Huber loss: quadratic within |error| < delta, linear outside.
    delta = 1.0
    self.td_error = minus(predicted_q, target_q, name='td_error')
    abs_error = abs(self.td_error)
    errors = element_select(less(abs_error, delta),
                            square(self.td_error) * 0.5,
                            delta * (abs_error - 0.5 * delta))
    # Scale each sample's loss by its importance-sampling weight.
    loss = errors * self.is_weights

    # Adam based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    # NOTE(review): local name `m_scheule` is a typo for `m_schedule`; it is
    # used consistently below, so behavior is unaffected.
    m_scheule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    self._learner = adam(self.model.parameters, lr_schedule, m_scheule,
                         variance_momentum=vm_schedule)
    self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model)
    # No separate eval metric; loss alone drives training.
    self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)