Example #1
import numpy as np
import theano
import theano.tensor as T
# floatX is a project helper (assumption, defined elsewhere) that casts values to
# theano.config.floatX, e.g. floatX = lambda x: np.asarray(x, dtype=theano.config.floatX)


def Adam(cost, params, alpha=floatX(0.001), beta_1=floatX(0.9), beta_2=floatX(0.999), epsilon=floatX(1e-8)):
    '''
        Follows the pseudo-code from
            ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION
                http://arxiv.org/pdf/1412.6980v8.pdf
    '''
    updates = []
    t = theano.shared(value=floatX(1.), name='t')
    grads = T.grad(cost, params)

    # fold the bias-correction terms into the step size (sec. 2 of the paper)
    alpha_t = alpha * T.sqrt(floatX(1.) - beta_2**t) / (floatX(1.) - beta_1**t)
    for param, gparam in zip(params, grads):
        value = param.get_value(borrow=True)
        # initialize first and second moment updates parameter-wise
        m = theano.shared(value=np.zeros(value.shape, dtype=value.dtype),
                          broadcastable=param.broadcastable, name='m')
        v = theano.shared(value=np.zeros(value.shape, dtype=value.dtype),
                          broadcastable=param.broadcastable, name='v')

        # update biased first/second moment estimates
        m_t = beta_1 * m + (floatX(1.) - beta_1) * gparam
        v_t = beta_2 * v + (floatX(1.) - beta_2) * T.sqr(gparam)

        # use the efficient update from sec. 2 of the paper to avoid
        # computing the unbiased estimates
        g_t = m_t / (T.sqrt(v_t) + epsilon)
        param_t = param - alpha_t * g_t

        # store changes to the shared variables
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((param, param_t))

    updates.append((t, t + 1))
    return updates
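A minimal usage sketch for the optimizer above (assumption: the toy linear model, data shapes, and variable names below are invented purely to show how the returned update list plugs into theano.function; they are not from the source project):

X = T.matrix('X')
y = T.vector('y')
W = theano.shared(floatX(np.zeros(5)), name='W')
b = theano.shared(floatX(0.), name='b')
cost = T.mean(T.sqr(T.dot(X, W) + b - y))   # hypothetical squared-error cost

updates = Adam(cost, [W, b])                # list of (shared_var, new_value) pairs
train = theano.function([X, y], cost, updates=updates)
loss = train(np.random.randn(32, 5).astype(theano.config.floatX),
             np.random.randn(32).astype(theano.config.floatX))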
Example #2
    def __init__(self, dqn_mt, gamma=0.95, l2_reg=0.0, lr=1e-3,
                 memory_size=250, minibatch_size=64,
                 nn_num_batch=1, nn_num_iter=2, regularizer={},
                 update_freq=1, target_freq=10, skip_frame=0,
                 frames_per_action=4,
                 exploration_kwargs={
                     'method': 'eps-greedy',
                     'epsilon': 0.1
                 }):
        '''
        TODO: `task` should be task info;
        we don't use all of the task's properties/methods here,
        only gamma and the state dimension,
        and we allow task switching.
        '''
        self.dqn = dqn_mt
        self.dqn_frozen = dqn_mt.copy()
        self.l2_reg = floatX(l2_reg)
        self.lr = floatX(lr)
        self.target_freq = target_freq
        self.update_freq = update_freq
        self.memory_size = memory_size
        self.minibatch_size = minibatch_size
        self.gamma = floatX(gamma)
        self.regularizer = regularizer
        self.skip_frame = skip_frame
        self.exploration_kwargs = exploration_kwargs
        self.frames_per_action = frames_per_action

        # for now, keep experience as a list of tuples
        self.experience = []
        self.exp_idx = 0
        self.total_exp = 0

        # used for streaming updates
        self.last_state = None
        self.last_valid_actions = None
        self.last_action = None

        # params for nn optimization.
        self.nn_num_batch = nn_num_batch
        self.nn_num_iter = nn_num_iter

        # diagnostics.
        self.diagnostics = {
            'nn-error': []  # training error of the neural network on mini-batches.
        }

        # compile the back-propagation network
        self._compile_bp()
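The experience / exp_idx / memory_size fields suggest a fixed-size replay memory that overwrites its oldest entries once full. A minimal sketch of how such an insertion method might look on this class (the method name _remember and the transition tuple layout are assumptions; the actual storage code is not shown in the source):

    def _remember(self, state, action, reward, next_state, terminal):
        # assumed helper: store one transition, overwriting entries in a
        # circular fashion once memory_size has been reached
        transition = (state, action, reward, next_state, terminal)
        if len(self.experience) < self.memory_size:
            self.experience.append(transition)
        else:
            self.experience[self.exp_idx] = transition
            self.exp_idx = (self.exp_idx + 1) % self.memory_size
        self.total_exp += 1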
Example #3
    def _get_frame(self):
        if self.state_type == 'pixel':
            # note: scipy.misc.imresize was removed in SciPy 1.3, so this
            # branch assumes an older SciPy release
            from scipy.misc import imresize
            img = self.curr_screen_rgb
            img = rgb2yuv(img)[:, :, 0]  # keep only the Y (luminance) channel, as in the Nature DQN paper
            img = imresize(img, (84, 84), interp='bicubic')
            return img / floatX(255.0)
        elif self.state_type == 'ram':
            return self._get_ram_state()
        elif self.state_type == '1hot':
            return self._get_1hot_state()
        else:
            raise NotImplementedError()
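Because scipy.misc.imresize is gone from current SciPy releases, here is a sketch of an equivalent luminance-extract-and-resize step using Pillow (using Pillow here is an assumption; the original project relies on SciPy, and the helper name preprocess_frame is hypothetical):

import numpy as np
from PIL import Image

def preprocess_frame(rgb_frame):
    # convert the RGB screen to 8-bit luminance ('L') and resize to 84x84
    # with bicubic interpolation, roughly matching rgb2yuv(...)[:, :, 0]
    # followed by imresize above
    img = Image.fromarray(rgb_frame).convert('L')
    img = img.resize((84, 84), Image.BICUBIC)
    return np.asarray(img, dtype=np.float32) / 255.0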
Example #4
    def curr_state(self):
        '''
        Return the stacked frames as pixel values normalized to [0, 1].
        '''
        return np.array(self.frames, dtype=floatX) / floatX(255.)  # normalize to [0, 1]
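The self.frames buffer itself is not shown; a common way to keep only the most recent frames is a fixed-length deque, sketched below (the FrameBuffer class, its attribute names, and the default of 4 frames are assumptions loosely based on frames_per_action in Example #2):

from collections import deque
import numpy as np

class FrameBuffer(object):
    '''Keep the most recent num_frames preprocessed frames.'''
    def __init__(self, num_frames=4):
        self.frames = deque(maxlen=num_frames)

    def push(self, frame):
        self.frames.append(frame)

    def curr_state(self):
        # stack the frames into a (num_frames, H, W) array normalized to [0, 1]
        return np.array(self.frames, dtype=np.float32) / 255.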