def Adam(cost, params, alpha=floatX(0.001), beta_1=floatX(0.9),
         beta_2=floatX(0.999), epsilon=floatX(1e-8)):
    '''
    Build the list of Theano updates for one Adam optimisation step.

    Follows the pseudo-code from
    ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION
    http://arxiv.org/pdf/1412.6980v8.pdf

    cost    -- symbolic expression differentiated with respect to `params`.
    params  -- shared variables to optimise; each must support
               `get_value` and expose `broadcastable`.
    alpha   -- base step size.
    beta_1  -- decay rate of the first-moment (mean) estimate.
    beta_2  -- decay rate of the second-moment (squared gradient) estimate.
    epsilon -- constant added to the denominator of the step.

    Returns a list of (shared_variable, new_expression) pairs: for every
    parameter its first moment, second moment and the parameter itself
    (in that order), followed by a single entry advancing the step
    counter `t`.
    '''

    def zero_moment(template, broadcastable, label):
        # shared buffer of zeros with the same shape/dtype as the parameter.
        return theano.shared(
            value=np.zeros(template.shape, dtype=template.dtype),
            broadcastable=broadcastable,
            name=label)

    updates = []

    # step counter, starts at 1 so the bias-correction terms are non-zero.
    t = theano.shared(value=floatX(1.), name='t')
    gradients = T.grad(cost, params)

    # step size with both bias corrections folded in.
    correction_2 = T.sqrt(floatX(1.) - beta_2**t)
    correction_1 = floatX(1.) - beta_1**t
    alpha_t = alpha * correction_2 / correction_1

    for param, gparam in zip(params, gradients):
        current = param.get_value(borrow=True)

        # per-parameter first and second moment accumulators.
        m = zero_moment(current, param.broadcastable, 'm')
        v = zero_moment(current, param.broadcastable, 'v')

        # biased first/second moment estimates.
        m_next = beta_1 * m + (floatX(1.) - beta_1) * gparam
        v_next = beta_2 * v + (floatX(1.) - beta_2) * T.sqr(gparam)

        # efficient form from sec. 2 of the paper: the unbiased estimates
        # are never computed explicitly, alpha_t carries the correction.
        direction = m_next / (T.sqrt(v_next) + epsilon)
        param_next = param - alpha_t * direction

        # record the new values of the shared variables.
        updates += [(m, m_next), (v, v_next), (param, param_next)]

    updates.append((t, t + 1))
    return updates
def __init__(self, dqn_mt, gamma=0.95, l2_reg=0.0, lr=1e-3, memory_size=250,
             minibatch_size=64, nn_num_batch=1, nn_num_iter=2,
             regularizer=None, update_freq=1, target_freq=10, skip_frame=0,
             frames_per_action=4, exploration_kwargs=None):
    '''
    Store the learner's hyper-parameters, reset its experience buffer and
    streaming state, then compile the back-propagation network.

    (TODO): task should be task info.
    we don't use all of task properties/methods here.
    only gamma and state dimension.
    and we allow task switching.

    dqn_mt             -- network object; kept as `self.dqn`, and
                          `dqn_mt.copy()` is kept as `self.dqn_frozen`.
    gamma, l2_reg, lr  -- converted with `floatX` before being stored
                          (presumably discount factor, L2 weight and
                          learning rate -- confirm against `_compile_bp`).
    memory_size, minibatch_size, nn_num_batch, nn_num_iter, update_freq,
    target_freq, skip_frame, frames_per_action
                       -- stored unchanged on the instance.
    regularizer        -- dict stored as `self.regularizer`; None (the
                          default) means a new empty dict.
    exploration_kwargs -- dict stored as `self.exploration_kwargs`; None
                          (the default) means a new
                          {'method': 'eps-greedy', 'epsilon': 0.1}.
    '''
    # Build the dict defaults per call: a dict literal in the signature is
    # created once and would be shared (and mutated) across all instances.
    if regularizer is None:
        regularizer = {}
    if exploration_kwargs is None:
        exploration_kwargs = {
            'method': 'eps-greedy',
            'epsilon': 0.1
        }

    self.dqn = dqn_mt
    self.dqn_frozen = dqn_mt.copy()
    self.l2_reg = floatX(l2_reg)
    self.lr = floatX(lr)
    self.target_freq = target_freq
    self.update_freq = update_freq
    self.memory_size = memory_size
    self.minibatch_size = minibatch_size
    self.gamma = floatX(gamma)
    self.regularizer = regularizer
    self.skip_frame = skip_frame
    self.exploration_kwargs = exploration_kwargs
    self.frames_per_action = frames_per_action

    # for now, keep experience as a list of tuples
    self.experience = []
    self.exp_idx = 0
    self.total_exp = 0

    # used for streaming updates
    self.last_state = None
    self.last_valid_actions = None
    self.last_action = None

    # params for nn optimization.
    self.nn_num_batch = nn_num_batch
    self.nn_num_iter = nn_num_iter

    # diagnostics.
    self.diagnostics = {
        'nn-error': []  # training of neural network on mini-batches.
    }

    # compile back-propagation network
    self._compile_bp()
def _get_frame(self):
    '''
    Return the current observation in the representation selected by
    `self.state_type`.

    'ram'   -- result of `self._get_ram_state()`.
    '1hot'  -- result of `self._get_1hot_state()`.
    'pixel' -- Y channel of the current RGB screen, resized to 84x84 with
               bicubic interpolation and divided by 255.

    Raises NotImplementedError for any other state type.
    '''
    kind = self.state_type

    if kind == 'ram':
        return self._get_ram_state()
    if kind == '1hot':
        return self._get_1hot_state()
    if kind != 'pixel':
        raise NotImplementedError()

    # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3, so this
    # branch needs an older SciPy; a replacement would change the numeric
    # output and is left for a separate change.
    from scipy.misc import imresize

    # get Y channel, according to Nature paper.
    luma = rgb2yuv(self.curr_screen_rgb)[:, :, 0]
    resized = imresize(luma, (84, 84), interp='bicubic')
    return resized / floatX(255.0)
def curr_state(self):
    '''
    return the stored frames as a `floatX` array divided by 255.
    '''
    # NOTE(review): if `self.frames` holds frames that were already divided
    # by 255 (as the pixel branch of `_get_frame` does), this scales twice
    # -- confirm against whatever fills `self.frames`.
    stacked = np.array(self.frames, dtype=floatX)
    scale = floatX(255.)
    return stacked / scale  # normalize