class ThreadTrainer(Thread):
    def __init__(self, server, id):
        super(ThreadTrainer, self).__init__()
        self.setDaemon(True)
        self.id = id
        self.server = server
        self.exit_flag = False
        self.rm = ReplayMemory(
            Config.TRAINING_REPLAY_MEMORY_SIZE,
            (Config.IMAGE_HEIGHT, Config.IMAGE_WIDTH, Config.STACKED_FRAMES),
            (24),
            dtype=np.float32)

    def run(self):
        while not self.exit_flag:
            # Consume batches of transitions produced by the worker processes.
            o_, r_, a_, n_, d_ = self.server.training_q.get()
            for i in range(o_.shape[0]):
                self.rm.enqueue(o_[i, ...], a_[i, ...], r_[i], n_[i, ...], d_[i])
            # Train every 20 insertions once the memory holds enough samples.
            if Config.TRAIN_MODELS and self.rm.i % 20 == 0 and self.rm.n > 200:
                o__, a__, r__, n__, d__ = self.rm.minibatch(
                    size=Config.TRAINING_MIN_BATCH_SIZE)
                self.server.train_model(o__, r__, a__, n__, d__, self.id)
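# The ReplayMemory consumed above is not shown in this section. The class below is a
# minimal, hypothetical ring-buffer sketch matching only how ThreadTrainer uses it:
# enqueue(obs, act, rew, next_obs, done), an insertion index `i`, a fill counter `n`,
# and uniform minibatch sampling. The name SimpleReplayMemory and all internals are
# assumptions; the real implementation may differ.
import numpy as np

class SimpleReplayMemory(object):
    def __init__(self, size, obs_shape, act_shape, dtype=np.float32):
        self.size = size
        self.obs = np.zeros((size,) + tuple(np.atleast_1d(obs_shape)), dtype=dtype)
        self.act = np.zeros((size,) + tuple(np.atleast_1d(act_shape)), dtype=dtype)
        self.rew = np.zeros(size, dtype=dtype)
        self.obs2 = np.zeros_like(self.obs)
        self.done = np.zeros(size, dtype=np.bool_)
        self.i = 0   # next write position (wraps around)
        self.n = 0   # number of valid entries

    def enqueue(self, obs, act, rew, obs2, done):
        self.obs[self.i], self.act[self.i] = obs, act
        self.rew[self.i], self.obs2[self.i], self.done[self.i] = rew, obs2, done
        self.i = (self.i + 1) % self.size
        self.n = min(self.n + 1, self.size)

    def minibatch(self, size):
        idx = np.random.randint(0, self.n, size)
        return (self.obs[idx], self.act[idx], self.rew[idx],
                self.obs2[idx], self.done[idx])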
def test_replay_memory():
    from replay_memory import ReplayMemory
    s = 100
    rm = ReplayMemory(s, 1, 1)
    for i in range(s):
        rm.enqueue(i, i % 3 == 0, i, i, i)
    for i in range(1000):
        o, a, r, o2, t2, info = rm.minibatch(10)
        assert all(o == o2 - 1), "error: o and o2"
        assert all(o != s - 1), "error: o wrap over rm. o = " + str(o)
        assert all(o2 != 0), "error: o2 wrap over rm"
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) self.dimA = dimA[0] self.dimO = dimO[0] tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma nets = icnn_nets_dm if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) def entropy(x): #the real concave entropy function x_move_reg = tf.clip_by_value((x + 1) / 2, 0.0001, 0.9999) pen = x_move_reg * tf.log(x_move_reg) + ( 1 - x_move_reg) * tf.log(1 - x_move_reg) return -tf.reduce_sum(pen, 1) # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # create tf computational graph self.theta = nets.theta(dimO[0], dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta') self.theta_t, update_t = exponential_moving_averages(self.theta, tau) obs = tf.placeholder(tf.float32, [1] + dimO, "obs") act_test = tf.placeholder(tf.float32, [1] + dimA, "act") # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init, name="noise", trainable=False) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - \ tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test, single sample q function & gradient for bundle method q_test_opt, _, _, _, _ = nets.qfunction(obs, act_test, self.theta, False, False) loss_test = -q_test_opt act_test_grad = tf.gradients(loss_test, act_test)[0] loss_test_entr = -q_test_opt - entropy(act_test) act_test_grad_entr = tf.gradients(loss_test_entr, act_test)[0] # batched q function & gradient for bundle method obs_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2_opt") act_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2_opt") q_train2_opt, _, _, _, _ = nets.qfunction(obs_train2_opt, act_train2_opt, self.theta_t, True, False) loss_train2 = -q_train2_opt act_train2_grad = tf.gradients(loss_train2, act_train2_opt)[0] loss_train2_entr = -q_train2_opt - entropy(act_train2_opt) act_train2_grad_entr = tf.gradients(loss_train2_entr, act_train2_opt)[0] # training obs_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train") act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2") act_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") q_train, q_train_z1, q_train_z2, q_train_u1, q_train_u2 = nets.qfunction( obs_train, act_train, self.theta, True, True) q_train_entropy = q_train + entropy(act_train) q_train2, _, _, _, _ = nets.qfunction(obs_train2, act_train2, self.theta_t, True, True) q_train2_entropy = q_train2 + entropy(act_train2) # q loss if FLAGS.icnn_opt == 'adam': q_target = tf.select(term2, rew, rew + discount * q_train2) q_target = tf.maximum(q_train - 1., q_target) q_target = tf.minimum(q_train + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_train - q_target elif FLAGS.icnn_opt == 'bundle_entropy': q_target = tf.select(term2, rew, rew + discount * q_train2_entropy) q_target = 
tf.maximum(q_train_entropy - 1., q_target) q_target = tf.minimum(q_train_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_train_entropy - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) theta = self.theta # TODO: Replace with something cleaner, this could easily stop working # if the variable names change. wd_q = tf.add_n([ l2norm * tf.nn.l2_loss(var) if var.name[6] == 'W' else 0. for var in theta ]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_t) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] if FLAGS.icnn_opt == 'adam': summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) elif FLAGS.icnn_opt == 'bundle_entropy': summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train_entropy))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) summary_list.append( tf.scalar_summary('cvx_z1', tf.reduce_mean(q_train_z1))) summary_list.append( tf.scalar_summary('cvx_z2', tf.reduce_mean(q_train_z2))) summary_list.append( tf.scalar_summary('cvx_z1_pos', tf.reduce_mean(tf.to_float(q_train_z1 > 1e-15)))) summary_list.append( tf.scalar_summary('cvx_z2_pos', tf.reduce_mean(tf.to_float(q_train_z2 > 1e-15)))) summary_list.append( tf.scalar_summary('noncvx_u1', tf.reduce_mean(q_train_u1))) summary_list.append( tf.scalar_summary('noncvx_u2', tf.reduce_mean(q_train_u2))) summary_list.append( tf.scalar_summary('noncvx_u1_pos', tf.reduce_mean(tf.to_float(q_train_u1 > 1e-15)))) summary_list.append( tf.scalar_summary('noncvx_u2_pos', tf.reduce_mean(tf.to_float(q_train_u2 > 1e-15)))) # tf functions with self.sess.as_default(): self._reset = Fun([], self.ou_reset) self._act_expl = Fun(act_test, act_expl) self._train = Fun( [obs_train, act_train, rew, obs_train2, act_train2, term2], [train_q, loss_q], summary_list, summary_writer) self._opt_test = Fun([obs, act_test], [loss_test, act_test_grad]) self._opt_train = Fun([obs_train2_opt, act_train2_opt], [loss_train2, act_train2_grad]) self._opt_test_entr = Fun([obs, act_test], [loss_test_entr, act_test_grad_entr]) self._opt_train_entr = Fun( [obs_train2_opt, act_train2_opt], [loss_train2_entr, act_train2_grad_entr]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs): b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None] * 3 for i in range(10000): f, g = func(obs, act) if i == 0: act_best = act.copy() f_best = f.copy() else: I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] m = b1 * m + (1. - b1) * g v = b2 * v + (1. 
- b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m / (1. - b1t) vhat = v / (1. - b2t) prev_act = act.copy() act -= alpha * mhat / (np.sqrt(v) + eps) act = np.clip(act, -1, 1) a_diff_i = np.mean(np.linalg.norm(act - prev_act, axis=1)) a_diff = a_diff_i if a_diff is None else lam * a_diff + ( 1. - lam) * a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff_i == 0 or a_diff < 1e-2: print(' + ADAM took {} iterations'.format(i)) return act_best print(' + Warning: ADAM did not converge.') return act_best def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': # f = self._opt_test_entr f = self._opt_test elif FLAGS.icnn_opt == 'bundle_entropy': f = self._opt_test else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) act = self.opt(f, obs) action = act if test else self._act_expl(act) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._opt_train elif FLAGS.icnn_opt == 'bundle_entropy': f = self._opt_train else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) print('--- Optimizing for training') act2 = self.opt(f, ob2) _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
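# For reference, the entropy(x) regularizer used by this agent is the binary entropy
# of each action dimension after rescaling from [-1, 1] to [0, 1], summed over
# dimensions. A NumPy sketch of the same quantity (the helper name action_entropy is
# mine; the clipping mirrors the 0.0001 / 0.9999 bounds in the TF version):
import numpy as np

def action_entropy(x, eps=1e-4):
    p = np.clip((x + 1.0) / 2.0, eps, 1.0 - eps)
    return -np.sum(p * np.log(p) + (1.0 - p) * np.log(1.0 - p), axis=1)

# Entropy is largest at x = 0 (p = 0.5) and shrinks toward the action-box corners:
x = np.array([[0.0, 0.0], [0.9, -0.9]])
print(action_entropy(x))  # approximately [1.386, 0.397]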
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma nets = naf_nets_dm # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True))) is_training = tf.placeholder(tf.bool) # create tf computational graph self.theta_L = nets.theta(dimO[0], dimA[0] * dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta_L') self.theta_U = nets.theta(dimO[0], dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta_U') self.theta_V = nets.theta(dimO[0], 1, FLAGS.l1size, FLAGS.l2size, 'theta_V') self.theta_Vt, update_Vt = exponential_moving_averages(self.theta_V, tau) obs_single = tf.placeholder(tf.float32, [1] + dimO, "obs-single") act_test = nets.ufunction(obs_single, self.theta_U, False, is_training) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init, name="noise", trainable=False) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # training obs_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, 'obs_train') act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # q lmat = nets.lfunction(obs_train, self.theta_L, False, is_training) uvalue = nets.ufunction(obs_train, self.theta_U, True, is_training) avalue = nets.afunction(act_train, lmat, uvalue, dimA[0]) q_train = nets.qfunction(obs_train, avalue, self.theta_V, False, is_training) # q targets q2 = nets.qfunction(obs2, tf.constant([0.] 
* FLAGS.bsize), self.theta_Vt, True, is_training) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) theta = self.theta_L + self.theta_U + self.theta_V wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in theta]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_Vt) summary_writer = tf.train.SummaryWriter(os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append(tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) # tf functions with self.sess.as_default(): self._act_test = Fun([obs_single, is_training], act_test) self._act_expl = Fun([obs_single, is_training], act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs_train, act_train, rew, obs2, term2, is_training], [train_q, loss_q], summary_list, summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs, False) if test else self._act_expl(obs, False) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) _, loss = self._train(obs, act, rew, ob2, term2, True, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
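# The exploration noise above is a discretized Ornstein-Uhlenbeck process kept in a
# TF variable: noise <- noise - (theta * noise - sigma * N(0, I)), with theta and
# sigma taken from FLAGS.outheta / FLAGS.ousigma. A NumPy sketch of the same update
# (class name OUNoise is mine):
import numpy as np

class OUNoise(object):
    def __init__(self, dim, theta=0.15, sigma=0.2):
        self.theta, self.sigma = theta, sigma
        self.state = np.zeros(dim)

    def reset(self):
        self.state[:] = 0.0

    def sample(self):
        self.state -= self.theta * self.state - self.sigma * np.random.randn(*self.state.shape)
        return self.state

# Usage mirrors act_expl above: act_expl = np.clip(act_test + ou.sample(), -1, 1).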
class Agent: def __init__(self, dimO, dimA): dimA, dimO = dimA[0], dimO[0] self.dimA = dimA self.dimO = dimO tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( allow_growth=True))) self.noise = np.zeros(self.dimA) obs = tf.placeholder(tf.float32, [None, dimO], "obs") act = tf.placeholder(tf.float32, [None, dimA], "act") rew = tf.placeholder(tf.float32, [None], "rew") with tf.variable_scope('q'): negQ = self.negQ(obs, act) negQ_entr = negQ - entropy(act) q = -negQ q_entr = -negQ_entr act_grad, = tf.gradients(negQ, act) act_grad_entr, = tf.gradients(negQ_entr, act) obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target") act_target = tf.placeholder(tf.float32, [None, dimA], "act_target") term_target = tf.placeholder(tf.bool, [None], "term_target") with tf.variable_scope('q_target'): negQ_target = self.negQ(obs_target, act_target) negQ_entr_target = negQ_target - entropy(act_target) act_target_grad, = tf.gradients(negQ_target, act_target) act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target) q_target = -negQ_target q_target_entr = -negQ_entr_target if FLAGS.icnn_opt == 'adam': y = tf.select(term_target, rew, rew + discount * q_target_entr) y = tf.maximum(q_entr - 1., y) y = tf.minimum(q_entr + 1., y) y = tf.stop_gradient(y) td_error = q_entr - y elif FLAGS.icnn_opt == 'bundle_entropy': raise RuntimError("Needs checking.") q_target = tf.select(term2, rew, rew + discount * q2_entropy) q_target = tf.maximum(q_entropy - 1., q_target) q_target = tf.minimum(q_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_entropy - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/') loss_q = ms_td_error + l2norm * tf.reduce_sum(regLosses) self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/') self.theta_cvx_ = [ v for v in self.theta_ if 'proj' in v.name and 'W:' in v.name ] self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_] # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.theta_target_ = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_target/') update_target = [ theta_target_i.assign_sub(tau * (theta_target_i - theta_i)) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_) ] optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) if FLAGS.icnn_opt == 'adam': tf.scalar_summary('Qvalue', tf.reduce_mean(q)) elif FLAGS.icnn_opt == 'bundle_entropy': tf.scalar_summary('Qvalue', tf.reduce_mean(q_entr)) tf.scalar_summary('loss', ms_td_error) tf.scalar_summary('reward', tf.reduce_mean(rew)) merged = tf.merge_all_summaries() # tf functions with self.sess.as_default(): self._train = Fun( [obs, act, rew, obs_target, act_target, term_target], 
[optimize_q, update_target, loss_q], merged, summary_writer) self._fg = Fun([obs, act], [negQ, act_grad]) self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad]) self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr]) self._fg_entr_target = Fun( [obs_target, act_target], [negQ_entr_target, act_entr_target_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.run(self.makeCvx) self.sess.run([ theta_target_i.assign(theta_i) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_) ]) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs, plot=False): # if npr.random() < 1./20: # plot = True b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None] * 3 hist = {'act': [], 'f': [], 'g': []} for i in range(1000): f, g = func(obs, act) if plot: hist['act'].append(act.copy()) hist['f'].append(f) hist['g'].append(g) if i == 0: act_best = act.copy() f_best = f.copy() else: prev_act_best = act_best.copy() I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] a_diff_i = np.mean( np.linalg.norm(act_best - prev_act_best, axis=1)) a_diff = a_diff_i if a_diff is None \ else lam*a_diff + (1.-lam)*a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff < 1e-3 and i > 5: print(' + Adam took {} iterations'.format(i)) if plot: self.adam_plot(func, obs, hist) return act_best m = b1 * m + (1. - b1) * g v = b2 * v + (1. - b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m / (1. - b1t) vhat = v / (1. - b2t) act -= alpha * mhat / (np.sqrt(v) + eps) # act = np.clip(act, -1, 1) act = np.clip(act, -1. + 1e-8, 1. - 1e-8) print(' + Warning: Adam did not converge.') if plot: self.adam_plot(func, obs, hist) return act_best def adam_plot(self, func, obs, hist): hist['act'] = np.array(hist['act']).T hist['f'] = np.array(hist['f']).T hist['g'] = np.array(hist['g']).T if self.dimA == 1: xs = np.linspace(-1. + 1e-8, 1. 
- 1e-8, 100) ys = [func(obs[[0], :], [[xi]])[0] for xi in xs] fig = plt.figure() plt.plot(xs, ys) plt.plot(hist['act'][0, 0, :], hist['f'][0, :], label='Adam') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) elif self.dimA == 2: assert (False) else: xs = npr.uniform(-1., 1., (5000, self.dimA)) ys = np.array([func(obs[[0], :], [xi])[0] for xi in xs]) epi = np.hstack((xs, ys)) pca = PCA(n_components=2).fit(epi) W = pca.components_[:, :-1] xs_proj = xs.dot(W.T) fig = plt.figure() X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100) Z = griddata(xs_proj[:, 0], xs_proj[:, 1], ys.ravel(), X, Y, interp='linear') plt.contourf(X, Y, Z, 15) plt.colorbar() adam_x = hist['act'][:, 0, :].T adam_x = adam_x.dot(W.T) plt.plot(adam_x[:, 0], adam_x[:, 1], label='Adam', color='k') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) def reset(self, obs): self.noise = np.zeros(self.dimA) self.observation = obs # initial observation def act(self, test=False): with self.sess.as_default(): print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': f = self._fg_entr # f = self._fg elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) tflearn.is_training(False) action = self.opt(f, obs) tflearn.is_training(not test) if not test: self.noise -= FLAGS.outheta*self.noise - \ FLAGS.ousigma*npr.randn(self.dimA) action += self.noise action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): with self.sess.as_default(): obs, act, rew, ob2, term2, info = self.rm.minibatch( size=FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._fg_entr_target # f = self._fg_target elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg_target else: raise RuntimeError("Unrecognized ICNN optimizer: " + FLAGS.icnn_opt) print('--- Optimizing for training') tflearn.is_training(False) act2 = self.opt(f, ob2) tflearn.is_training(True) _, _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) self.sess.run(self.proj) return loss def negQ(self, x, y, reuse=False): szs = [FLAGS.l1size, FLAGS.l2size] assert (len(szs) >= 1) fc = tflearn.fully_connected bn = tflearn.batch_normalization lrelu = tflearn.activations.leaky_relu if reuse: tf.get_variable_scope().reuse_variables() nLayers = len(szs) us = [] zs = [] z_zs = [] z_ys = [] z_us = [] reg = 'L2' prevU = x for i in range(nLayers): with tf.variable_scope('u' + str(i)) as s: u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg) if i < nLayers - 1: u = tf.nn.relu(u) if FLAGS.icnn_bn: u = bn(u, reuse=reuse, scope=s, name='bn') variable_summaries(u, suffix='u{}'.format(i)) us.append(u) prevU = u prevU, prevZ = x, y for i in range(nLayers + 1): sz = szs[i] if i < nLayers else 1 z_add = [] if i > 0: with tf.variable_scope('z{}_zu_u'.format(i)) as s: zu_u = fc(prevU, szs[i - 1], reuse=reuse, scope=s, activation='relu', bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) 
variable_summaries(zu_u, suffix='zu_u{}'.format(i)) with tf.variable_scope('z{}_zu_proj'.format(i)) as s: z_zu = fc(tf.mul(prevZ, zu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) variable_summaries(z_zu, suffix='z_zu{}'.format(i)) z_zs.append(z_zu) z_add.append(z_zu) with tf.variable_scope('z{}_yu_u'.format(i)) as s: yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(yu_u, suffix='yu_u{}'.format(i)) with tf.variable_scope('z{}_yu'.format(i)) as s: z_yu = fc(tf.mul(y, yu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) z_ys.append(z_yu) variable_summaries(z_yu, suffix='z_yu{}'.format(i)) z_add.append(z_yu) with tf.variable_scope('z{}_u'.format(i)) as s: z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(0.)) variable_summaries(z_u, suffix='z_u{}'.format(i)) z_us.append(z_u) z_add.append(z_u) z = tf.add_n(z_add) variable_summaries(z, suffix='z{}_preact'.format(i)) if i < nLayers: # z = tf.nn.relu(z) z = lrelu(z, alpha=FLAGS.lrelu) variable_summaries(z, suffix='z{}_act'.format(i)) zs.append(z) prevU = us[i] if i < nLayers else None prevZ = z z = tf.reshape(z, [-1], name='energies') return z def __del__(self): self.sess.close()
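# Why makeCvx / proj exist: negQ is an input-convex network in the action y, and
# convexity is preserved as long as the weights on the z-path (the 'proj' variables
# collected above) stay nonnegative and the activations are convex and nondecreasing.
# That is why train() ends with self.sess.run(self.proj). A stripped-down sketch of
# the idea, omitting the state-dependent u-path, batch norm, and leaky ReLUs of the
# full negQ (all names below are illustrative):
import numpy as np

def relu(v):
    return np.maximum(v, 0.0)

def icnn_scalar(y, Wy1, b1, Wz, Wy2, b2):
    # Convex in y as long as Wz >= 0 elementwise (the constraint `proj` maintains).
    z1 = relu(y @ Wy1 + b1)
    return z1 @ Wz + y @ Wy2 + b2

# After each optimizer step on the weights, project the z-path weights back to the
# feasible set, Wz = np.maximum(Wz, 0.0), the NumPy analogue of
# self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_].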
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) self.dimA = dimA[0] self.dimO = dimO[0] tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma nets = icnn_nets_dm # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.1))) # create tf computational graph self.theta = nets.theta(dimO[0], dimA[0], FLAGS.l1size, FLAGS.l2size, 'theta') self.theta_t, update_t = exponential_moving_averages(self.theta, tau) obs = tf.placeholder(tf.float32, [1] + dimO, "obs") act_test = tf.placeholder(tf.float32, [1] + dimA, "act") # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init, name="noise", trainable=False) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test, single sample q function & gradient for bundle method q_test_opt, cz1, cz2, cz3, _, _, _, _ = nets.qfunction(obs, act_test, self.theta) loss_test = -q_test_opt act_test_grad = tf.gradients(loss_test, act_test)[0] # batched q function & gradient for bundle method obs_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2_opt") act_train2_opt = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2_opt") q_train2_opt, cz1t, cz2t, cz3t, _, _, _, _ = nets.qfunction(obs_train2_opt, act_train2_opt, self.theta_t) loss_train2 = -q_train2_opt act_train2_grad = tf.gradients(loss_train2, act_train2_opt)[0] # training obs_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train") act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs_train2") act_train2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") def entropy(x): #the real concave entropy function x_move_reg = tf.clip_by_value((x + 1) / 2, 0.0001, 0.9999) pen = x_move_reg * tf.log(x_move_reg) + (1 - x_move_reg) * tf.log(1 - x_move_reg) return -tf.reduce_sum(pen, 1) q_train, q_train_cz1, q_train_cz2, q_train_cz3, q_train_z1, q_train_z2, q_train_u1, q_train_u2 = nets.qfunction(obs_train, act_train, self.theta) q_train_entropy = q_train + entropy(act_train) q_train2, q_train2_cz1, q_train2_cz2, q_train2_cz3, _, _, _, _ = nets.qfunction(obs_train2, act_train2, self.theta_t) q_train2_entropy = q_train2 + entropy(act_train2) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q_train2_entropy)) # q loss td_error = q_train_entropy - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) theta = self.theta wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in theta]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) grads_and_vars_q_clip = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in grads_and_vars_q] optimize_q = optim_q.apply_gradients(grads_and_vars_q_clip) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_t) summary_writer = tf.train.SummaryWriter(os.path.join(FLAGS.outdir, 'board'), 
self.sess.graph) summary_list = [] summary_list.append(tf.scalar_summary('Qvalue', tf.reduce_mean(q_train_entropy))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) summary_list.append(tf.scalar_summary('cvx_z1', tf.reduce_mean(q_train_z1))) summary_list.append(tf.scalar_summary('cvx_z2', tf.reduce_mean(q_train_z2))) summary_list.append(tf.scalar_summary('cvx_z1_pos', tf.reduce_mean(tf.to_float(q_train_z1 > 0)))) summary_list.append(tf.scalar_summary('cvx_z2_pos', tf.reduce_mean(tf.to_float(q_train_z2 > 0)))) summary_list.append(tf.scalar_summary('noncvx_u1', tf.reduce_mean(q_train_u1))) summary_list.append(tf.scalar_summary('noncvx_u2', tf.reduce_mean(q_train_u2))) summary_list.append(tf.scalar_summary('noncvx_u1_pos', tf.reduce_mean(tf.to_float(q_train_u1 > 1e-15)))) summary_list.append(tf.scalar_summary('noncvx_u2_pos', tf.reduce_mean(tf.to_float(q_train_u2 > 1e-15)))) # tf functions with self.sess.as_default(): self._cz = Fun([obs], [cz1, cz2, cz3]) self._czt = Fun([obs_train2_opt], [cz1t, cz2t, cz3t]) self._reset = Fun([], self.ou_reset) self._act_expl = Fun(act_test, act_expl) self._train = Fun([obs_train, act_train, rew, obs_train2, act_train2, term2], [train_q, loss_q], summary_list, summary_writer) self._opt_test = Fun([act_test, cz1, cz2, cz3], [loss_test, act_test_grad]) self._opt_train = Fun([act_train2_opt, cz1t, cz2t, cz3t], [loss_train2, act_train2_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def get_cvx_opt(self, func, cz1, cz2, cz3): act = np.ones((cz1.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(2 * x - 1, cz1, cz2, cz3) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) cz1, cz2, cz3 = self._cz(obs) act = self.get_cvx_opt(self._opt_test, cz1, cz2, cz3) action = act if test else self._act_expl(act) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) cz1t, cz2t, cz3t = self._czt(ob2) act2 = self.get_cvx_opt(self._opt_train, cz1t, cz2t, cz3t) _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
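# Note on get_cvx_opt above: the bundle-entropy solver works over actions in [0, 1],
# while the Q-network takes actions in [-1, 1]. The fg wrapper applies the affine map
# a = 2x - 1 and rescales the gradient by the same factor (chain rule), and the
# state-dependent activations cz1..cz3 are computed once per observation and reused
# on every inner iteration. A sketch of the same wrapping around a generic objective
# func(act, *cached) returning (value, grad_wrt_act):
def make_unit_interval_objective(func, *cached):
    def fg(x):                     # x in [0, 1]^dimA
        value, grad = func(2.0 * x - 1.0, *cached)
        return value, 2.0 * grad   # d/dx f(2x - 1) = 2 f'(2x - 1)
    return fg

# get_cvx_opt then solves min_x fg(x) over the box [0, 1]^dimA and maps the solution
# back with act = 2 * x - 1.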
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = nets_dm # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) # own replay memory self.replay_memory = deque(maxlen=rm_size) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q, name= 'q_mu_of_s') # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # q q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q, name= 'qs_a') # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt, name='qsprime_aprime') q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])] log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0])] log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0])] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)] log_noise = [tf.histogram_summary('noise', noise_var)] log_train = log_obs + log_act + log_act2 + log_misc + log_grad + log_noise merged = tf.merge_all_summaries() # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, 
rm_dtype) # tf functions with self.sess.as_default(): self.act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun([obs, act_train, rew, obs2, term2], [train_q], log_train, self.writer) self._train_p = Fun([obs], [train_p]) self._train_p = Fun([obs], [train_p], log_obs, self.writer) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], merged, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self.act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False, perform_trainstep= True): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) self.replay_memory.append((obs1, self.action, rew, obs2, term)) if self.t > FLAGS.warmup: # print('warmed up') if perform_trainstep: self.train() # elif FLAGS.warmq and self.rm.n > 1000: # # Train Q on warmup # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) # self._train_q(obs, act, rew, ob2, term2, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. # if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"f/tf/c",self.t) # print("DDPG Checkpoint: " + s) def train(self): # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) obs, act, rew, ob2, term2, = self.get_train_batch() log = (np.random.rand() < FLAGS.log) if FLAGS.async: self._train(obs, act, rew, ob2, term2, log=log, global_step=self.t) else: self._train_q(obs, act, rew, ob2, term2, log=log, global_step=self.t) self._train_p(obs, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close() def get_train_batch(self): #selecting transitions randomly from the replay memory: indices = np.random.randint(0, len(self.replay_memory), [FLAGS.bsize]) transition_batch = [self.replay_memory[i] for i in indices] states = np.asarray([transition_batch[i][0].squeeze() for i in range(FLAGS.bsize)]) actions = np.asarray([transition_batch[i][1] for i in range(FLAGS.bsize)]) rewards = np.asarray([transition_batch[i][2] for i in range(FLAGS.bsize)]) states_prime = np.asarray([transition_batch[i][3].squeeze() for i in range(FLAGS.bsize)]) term2 = np.asarray([transition_batch[i][4] for i in range(FLAGS.bsize)]) return states, actions, rewards, states_prime, term2
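# The target parameters theta_pt / theta_qt above are soft copies of the online
# parameters refreshed at rate tau alongside each training step. A NumPy sketch of
# that soft (Polyak) update, written in the explicit form the ICNN agent earlier in
# this section uses (theta_target.assign_sub(tau * (theta_target - theta))):
def soft_update(target_params, online_params, tau=1e-3):
    for t, p in zip(target_params, online_params):
        t += tau * (p - t)   # theta_target <- (1 - tau) * theta_target + tau * theta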
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = nets_dm # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub( (FLAGS.ou_theta) * noise_var - tf.random_normal(dimA, stddev=FLAGS.ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q) # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") term = tf.placeholder(tf.bool, [FLAGS.bsize], "term") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") # q q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q) # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [ tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0]) ] log_act = [] if dimA[0] > 20 else [ tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0]) ] log_act2 = [] if dimA[0] > 20 else [ tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0]) ] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [ grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q) ] log_train = log_obs + log_act + log_act2 + log_misc + log_grad # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype) # tf functions with self.sess.as_default(): self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun([obs, act_train, 
rew, term, obs2], [train_q], log_train, self.writer) self._train_p = Fun([obs], [train_p], log_train, self.writer) self._train = Fun([obs, act_train, rew, term, obs2], [train_p, train_q], log_train, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: self.train() elif FLAGS.warmq and self.rm.n > 1000: # Train Q on warmup obs, act, rew, term, ob2, info = self.rm.minibatch( size=FLAGS.bsize) self._train_q(obs, act, rew, term, ob2, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. # if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"f/tf/c",self.t) # print("DDPG Checkpoint: " + s) def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) log = (np.random.rand() < FLAGS.log) if FLAGS. async: self._train(obs, act, rew, ob2, term2, log=log, global_step=self.t) else: self._train_q(obs, act, rew, ob2, term2, log=log, global_step=self.t) self._train_p(obs, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close()
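# The TD target built with tf.select above is the usual terminal-aware bootstrap:
# y = rew if the next state is terminal, and y = rew + discount * Q'(s2) otherwise,
# i.e. y = rew + discount * (1 - terminal) * Q'(s2). NumPy equivalent (helper name
# td_target is mine):
import numpy as np

def td_target(rew, q2, term, discount=0.99):
    return np.where(term, rew, rew + discount * q2)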
class Agent(object): def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) if len(dimO) > 1: assert len(dimO) == 3 self.use_conv = True nets = ddpg_convnets_dm else: self.use_conv = False nets = ddpg_nets_dm # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # Placeholders input_obs_dim = [None] + dimO obs = tf.placeholder(tf.float32, input_obs_dim, "obs") is_training = tf.placeholder(tf.bool, [], name='is_training') act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) self.setup_actor_critic(nets, dimO, dimA, obs, obs2, is_training, rew, term2, act_train) summary_list.extend(self.actor.get_summary()) summary_list.extend(self.critic.get_summary()) # summary_list.append(tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) # summary_list.append(tf.scalar_summary('loss', ms_td_error)) # tf functions with self.sess.as_default(): train_outputs = self.actor.get_train_outputs( ) + self.critic.get_train_outputs() self._train = Fun([obs, act_train, rew, obs2, term2, is_training], train_outputs, summary_list, summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def setup_actor_critic(self, nets, dimO, dimA, obs, obs2, is_training, rew, term2, act_train): self.actor = Actor(self.use_conv, nets, dimO, dimA, obs, obs2, is_training, self.sess, scope='actor') self.critic = Critic(self.use_conv, nets, dimO, dimA, obs, obs2, rew, term2, is_training, act_train, self.actor, scope='critic') self.actor.compute_loss(self.critic, obs, is_training) def reset(self, obs): self.actor.reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self.actor.act(obs, test) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) _, _, loss = self._train(obs, act, rew, ob2, term2, True, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
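# The Actor and Critic classes used by this Agent are defined elsewhere; the skeleton
# below only records the interface the Agent relies on (constructor arguments and the
# methods it calls). Bodies are intentionally left empty -- this is an inferred
# outline, not the actual implementation.
class Actor(object):
    def __init__(self, use_conv, nets, dimO, dimA, obs, obs2, is_training, sess,
                 scope='actor'):
        ...
    def compute_loss(self, critic, obs, is_training): ...
    def get_summary(self): ...        # list of summary ops
    def get_train_outputs(self): ...  # ops handed to Fun(...) for training
    def reset(self): ...              # reset exploration state
    def act(self, obs, test): ...     # action for a single observation

class Critic(object):
    def __init__(self, use_conv, nets, dimO, dimA, obs, obs2, rew, term2,
                 is_training, act_train, actor, scope='critic'):
        ...
    def get_summary(self): ...
    def get_train_outputs(self): ...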
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = ddpg_nets_dm tau = FLAGS.tau discount = FLAGS.discount pl2norm = FLAGS.pl2norm l2norm = FLAGS.l2norm plearning_rate = FLAGS.prate learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test q = nets.qfunction(obs, act_test, self.theta_q) # training # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # policy loss act_train_policy = nets.policy(obs, self.theta_p) q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q) meanq = tf.reduce_mean(q_train_policy, 0) wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q q_train = nets.qfunction(obs, act_train, self.theta_q) # q targets act2 = nets.policy(obs2, theta=self.theta_pt) q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) # tf functions with self.sess.as_default(): self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q, loss_q], summary_list, 
summary_writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in xrange(FLAGS.iter): loss = self.train() def train(self): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) _, _, loss = self._train(obs, act, rew, ob2, term2, log=FLAGS.summary, global_step=self.t) return loss def __del__(self): self.sess.close()
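# The two objectives above are the standard DDPG losses plus L2 weight decay:
#   policy:  loss_p = -mean_b Q(s, pi(s))                (maximize the critic's value)
#   critic:  loss_q =  mean_b (Q(s, a) - y)^2,  y = rew + discount * Q'(s2, pi'(s2)),
# with y set to rew alone on terminal transitions. NumPy sketch of the scalar losses
# given already-evaluated network outputs for one minibatch (function name is mine):
import numpy as np

def ddpg_losses(q_of_pi, q_of_a, rew, q2_target, term, discount=0.99):
    loss_p = -np.mean(q_of_pi)
    y = np.where(term, rew, rew + discount * q2_target)
    loss_q = np.mean((q_of_a - y) ** 2)
    return loss_p, loss_q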
class Agent: def __init__( self, dimO, dimA, nets=nets_dm, tau=.001, # fdsla discount=.99, pl2=.0, ql2=.01, lrp=.0001, lrq=.001, ou_theta=0.15, ou_sigma=0.2, rm_size=500000, rm_dtype='float32', mb_size=32, threads=4, **kwargs): dimA = list(dimA) dimO = list(dimO) # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) self.mb_size = mb_size # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q) # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [None] + dimA, "act_train") rew = tf.placeholder(tf.float32, [None], "rew") obs2 = tf.placeholder(tf.float32, [None] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [None], "term2") # q q, sum_qq = nets.qfunction(obs, act_train, self.theta_q) # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # = tf.stop_gradient(rew + discount * q2) # q loss mb_td_error = tf.square(q - q_target) mean_td_error = tf.reduce_mean(mb_td_error, 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = mean_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [ tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0]) ] log_act = [] if dimA[0] > 20 else [ tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0]) ] log_act2 = [] if dimA[0] > 20 else [ tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0]) ] log_misc = [ sum_p, sum_qq, tf.histogram_summary("qfunction/td_error", mb_td_error) ] log_grad = [ grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q) ] log_train = log_obs + log_act + log_act2 + log_misc + log_grad # initialize tf log writer self.writer = tf.train.SummaryWriter("./tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype) # tf functions with self.sess.as_default(): 
self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], log_train, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint("./tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = np.squeeze(obs) # initial observation def act(self, test=False, logging=False): obs = np.expand_dims(self.observation, axis=0) action = self._act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False): rew = self.reward(rew) # internal reward # TODO: outsource if not test: self.t = self.t + 1 self.rm.enqueue(self.observation, term, self.action, rew) # save parameters etc. if (self.t + 45000) % 50000 == 0: # TODO: correct s = self.saver.save(self.sess, "./tf/c", self.t) print("DDPG Checkpoint: " + s) self.observation = np.squeeze(obs2) # current observation <- obs2 return rew def train(self, logging=False): obs, act, rew, obs2, term2, info = self.rm.minibatch(size=self.mb_size) self._train(obs, act, rew, obs2, term2, log=logging, global_step=self.t) def reward(self, external_reward, logging=False): """ calculate internal reward """ ra = -.1 * np.mean(np.square(self.action)) rint = external_reward + ra if logging: self.write_scalar('reward/ext', external_reward) self.write_scalar('reward/a', ra) self.write_scalar('reward/rint', rint) return rint def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close()
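# reward() above shapes the environment reward with a quadratic action penalty,
# r_int = r_ext - 0.1 * mean(a^2), discouraging large control signals. Worked example
# for a 2-dimensional action:
import numpy as np

a = np.array([0.8, -0.4])
r_ext = 1.0
r_int = r_ext - 0.1 * np.mean(np.square(a))   # 1.0 - 0.1 * 0.4 = 0.96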
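The Agent class above calls two helpers, exponential_moving_averages and Fun, that are not defined in this file (the Wolpertinger variant further below uses them as well). The following is a minimal sketch of plausible implementations, written against the same pre-1.0 TensorFlow API used above; the names and details are assumptions, not the original code.

import tensorflow as tf

def exponential_moving_averages(theta, tau=0.001):
    # Target-network tracking: shadow copies of theta that move toward theta at rate tau,
    # as used for self.theta_pt / self.theta_qt above.
    ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
    update = ema.apply(theta)                    # op that updates the shadow variables
    averages = [ema.average(x) for x in theta]   # the slowly tracking target parameters
    return averages, update

class Fun:
    # Binds placeholders and ops to a plain Python callable; optionally runs summary ops
    # and writes them to a SummaryWriter at the given global step.
    def __init__(self, inputs, outputs, summary_ops=None, summary_writer=None):
        self._inputs = inputs if isinstance(inputs, list) else [inputs]
        self._outputs = outputs if isinstance(outputs, list) else [outputs]
        self._summaries = summary_ops if summary_ops is not None else []
        self._writer = summary_writer
        self._session = tf.get_default_session()

    def __call__(self, *args, **kwargs):
        log = kwargs.get('log', False)
        global_step = kwargs.get('global_step', None)
        feeds = dict(zip(self._inputs, args))
        fetches = list(self._outputs)
        if log:
            fetches = fetches + self._summaries
        results = self._session.run(fetches, feed_dict=feeds)
        if log and self._writer is not None:
            for summary in results[len(self._outputs):]:
                self._writer.add_summary(summary, global_step=global_step)
        return results[0] if len(self._outputs) == 1 else results[:len(self._outputs)]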
class Agent: """ DDPG Agent """ started_train = False def __init__(self, dimO, dimA, custom_policy=False, env_dtype=tf.float32): dimA = list(dimA) dimO = list(dimO) nets = nets_dm self.custom_policy = custom_policy # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[env_dtype]) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(env_dtype, [None] + dimO, "obs") is_training = tf.placeholder(tf.bool, name="is_training") # act_test, sum_p = nets.policy(obs, self.theta_p) act_test, sum_p = nets.policy( obs, self.theta_p) if not FLAGS.batch_norm else nets.policy_norm( obs, self.theta_p, is_training) # explore noise_init = tf.zeros([1] + dimA, dtype=env_dtype) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub( (ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma, dtype=env_dtype)) act_expl = act_test + noise # for Wolpertinger full policy act_cont = tf.placeholder(env_dtype, [None] + dimA, "action_cont_space") # g_actions = tf.placeholder(env_dtype, [FLAGS.knn] + dimA, "knn_actions") g_actions = tf.placeholder(env_dtype, [None] + dimA, "knn_actions") # rew_g = tf.placeholder(env_dtype, [FLAGS.knn] + dimA, "rew") # rew_g = tf.placeholder(env_dtype, [FLAGS.knn], "rew_g") # term_g = tf.placeholder(tf.bool, [FLAGS.knn], "term_g") rew_g = tf.placeholder(env_dtype, [1], "rew_g") term_g = tf.placeholder(tf.bool, [1], "term_g") # g_dot_f = tf.mul(g_actions, act_cont, "g_dot_f") g_dot_f = g_actions q_eval, _ = nets.qfunction( obs, g_dot_f, self.theta_q) if not FLAGS.batch_norm else nets.qfunction_norm( obs, g_dot_f, self.theta_q, is_training, reuse=True) # wolpertinger_policy = tf.stop_gradient( tf.argmax( tf.select(term_g, rew_g, rew_g + discount * q_eval), # dimension=0, name="q_max") ) wolpertinger_policy = tf.stop_gradient( tf.select(term_g, rew_g, rew_g + discount * q_eval)) # test # q, sum_q = nets.qfunction(obs, act_test, self.theta_q) q, sum_q = nets.qfunction( obs, act_test, self.theta_q) if not FLAGS.batch_norm else nets.qfunction_norm( obs, act_test, self.theta_q, is_training) # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p #??? 
# policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(env_dtype, [FLAGS.bsize] + dimA, "act_train") g_act_train = tf.placeholder(env_dtype, [FLAGS.bsize] + dimA, "g_act_train") rew = tf.placeholder(env_dtype, [FLAGS.bsize], "rew") obs2 = tf.placeholder(env_dtype, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # FOR WOLPERTINGER FUNCTIONALITY: evaluate whether the agent is using pure DDPG or DDPG + Wolpertinger tensor_cond = tf.constant(self.custom_policy, dtype=tf.bool, name="is_custom_p") # full_act_policy = tf.cond(tensor_cond, # # lambda: tf.mul(g_act_train, act_train, name="full_act_policy"), # lambda: g_act_train, # lambda: act_train, # ) # q # q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q) # TAKING THE POLICY GRADIENT AT THE ACTUAL OUTPUT OF f q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q) if not FLAGS.batch_norm else \ nets.qfunction_norm(obs, act_train, self.theta_q, is_training, reuse=True) # q_train, sum_qq = nets.qfunction(obs, full_act_policy, self.theta_q) if not FLAGS.batch_norm else \ # nets.qfunction_norm(obs, full_act_policy, self.theta_q, is_training, reuse=True) # q targets # act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) act2, sum_p2 = nets.policy( obs2, theta=self.theta_pt) if not FLAGS.batch_norm else nets.policy_norm( obs2, theta=self.theta_pt, is_training=is_training, reuse=True) # WOLPERTINGER FUNCTIONALITY: The target action in the Q-update is generated by the full policy and not simply f full_act_policy2 = tf.cond( tensor_cond, # lambda: tf.mul(g_act_train, act2, name="full_act_policy"), lambda: g_act_train, lambda: act2, ) # q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) # q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) if not FLAGS.batch_norm else nets.qfunction_norm(obs2, act2, theta=self.theta_qt, is_training=is_training, reuse=True) q2, sum_q2 = nets.qfunction( obs2, full_act_policy2, theta=self.theta_qt ) if not FLAGS.batch_norm else nets.qfunction_norm( obs2, full_act_policy2, theta=self.theta_qt, is_training=is_training, reuse=True) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target # the sign convention is immaterial here: only the square enters the loss below ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [ tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0]) ] log_act = [] if dimA[0] > 20 else [ tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0]) ] log_act2 = [] if dimA[0] > 20 else [ tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0]) ] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [ grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q) ] log_train =
log_obs + log_act + log_act2 + log_misc + log_grad # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, env_dtype) # tf functions with self.sess.as_default(): # self._act_test = Fun(obs,act_test) # self._act_expl = Fun(obs,act_expl) # self._reset = Fun([],self.ou_reset) # self._train_q = Fun([obs,act_train,rew,obs2,term2],[train_q],log_train,self.writer) # self._train_p = Fun([obs],[train_p],log_train,self.writer) # self._train = Fun([obs,act_train,rew,obs2,term2],[train_p,train_q],log_train,self.writer) self._act_test = Fun([obs, is_training], act_test) self._act_expl = Fun([obs, is_training], act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun( [obs, act_train, g_act_train, rew, obs2, term2, is_training], [train_q], log_train, self.writer) self._train_p = Fun([obs, is_training], [train_p], log_train, self.writer) self._train = Fun( [obs, act_train, g_act_train, rew, obs2, term2, is_training], [train_p, train_q], log_train, self.writer) self._wolpertinger_p = Fun( [obs, act_cont, g_actions, rew_g, term_g, is_training], [wolpertinger_policy]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: print "==> RESTORING VARIABLES FROM CHECKPOINT: {}".format(ckpt) self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) # action = self._act_test(obs) if test else self._act_expl(obs) action = self._act_test(obs, False) if test else self._act_expl( obs, True) self.action = np.atleast_1d(np.squeeze( action, axis=0)) # TODO: remove this hack return self.action def wolpertinger_policy(self, action_cont, g_actions, rew_g, term_g): obs = np.expand_dims(self.observation, axis=0) action_cont = np.expand_dims(action_cont, axis=0) # rew_g = np.expand_dims(rew_g, axis=0) # return np.asarray( self._wolpertinger_p(obs, action_cont, g_actions, rew_g, term_g) ) i = 0 q_values = [] for g_action in g_actions: g_action = np.expand_dims(g_action, axis=0) q_values.append( self._wolpertinger_p(obs, action_cont, g_action, [rew_g[i]], [term_g[i]])[0]) i += 1 # return self._wolpertinger_p(obs, action_cont, g_actions, rew_g, term_g)[0] return np.argmax(q_values) def observe(self, rew, term, obs2, test=False, g_action=None): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, g_action, rew) if self.t > FLAGS.warmup: self.train() elif FLAGS.warmq and self.rm.n > 1000: # Train Q on warmup obs, act, g_act, rew, ob2, term2, info = self.rm.minibatch( size=FLAGS.bsize) # self._train_q(obs,act,rew,ob2,term2, log = (np.random.rand() < FLAGS.log), global_step=self.t) for i in xrange(FLAGS.iter): self._train_q(obs, act, g_act, rew, ob2, term2, True, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. 
# if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"/tf/c",self.t) # print("DDPG Checkpoint: " + s) def checkpoint_session(self): return self.saver.save(self.sess, FLAGS.outdir + "/tf/c", self.t) def train(self): if not self.started_train: with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f: f.write('===> Warm up complete\n') self.started_train = True obs, act, g_act, rew, ob2, term2, info = self.rm.minibatch( size=FLAGS.bsize) log = (np.random.rand() < FLAGS.log) if FLAGS.async: # self._train(obs,act,rew,ob2,term2, log = log, global_step=self.t) for i in xrange(FLAGS.iter): self._train(obs, act, g_act, rew, ob2, term2, True, log=log, global_step=self.t) else: # self._train_q(obs,act,rew,ob2,term2, log = log, global_step=self.t) # self._train_p(obs, log = log, global_step=self.t) for i in xrange(FLAGS.iter): self._train_q(obs, act, g_act, rew, ob2, term2, True, log=log, global_step=self.t) self._train_p(obs, True, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close()
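Both Agent variants above build their gradient logging through a grad_histograms helper that is not defined in this file. Below is a minimal sketch of a compatible implementation using the same pre-1.0 TensorFlow summary API as the rest of the code; the helper body and tag names are assumptions, not the original code.

import tensorflow as tf

def grad_histograms(grads_and_vars):
    # One histogram per variable and per gradient, merged into a single summary,
    # matching how grad_histograms(...) is consumed in log_grad above.
    summaries = []
    for grad, var in grads_and_vars:
        name = var.op.name
        summaries.append(tf.histogram_summary(name + '/value', var))
        if grad is not None:
            summaries.append(tf.histogram_summary(name + '/gradient', grad))
    return tf.merge_summary(summaries)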