def build_evaluator_model(kwargs):
    Model = warp_Model()

    import tensorflow as tf
    from module import RMCRNN
    from module import TmpHierRMCRNN
    from module import TmpHierRNN

    frames = kwargs["frames"]
    act_space = kwargs["act_space"]
    state_size = kwargs["state_size"]
    use_rmc = kwargs["use_rmc"]
    use_hrmc = kwargs["use_hrmc"]
    use_hrnn = kwargs["use_hrnn"]
    use_reward_prediction = kwargs["use_reward_prediction"]
    after_rnn = kwargs["after_rnn"]
    use_pixel_control = kwargs["use_pixel_control"]
    use_pixel_reconstruction = kwargs["use_pixel_reconstruction"]

    phs = dict()
    phs["s"] = tf.placeholder(dtype=tf.float32, shape=[None, None, 84, 84, frames])
    phs["prev_a"] = tf.placeholder(dtype=tf.int32, shape=[None, None])
    phs["prev_r"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])

    if use_hrnn:
        rnn = TmpHierRNN(4, 64, 4, 2, 8, 'lstm', 'rmc',
                         return_sequences=True, return_state=True, name="hrnn")
    elif use_hrmc:
        rnn = TmpHierRMCRNN(4, 64, 4, 4,
                            return_sequences=True, return_state=True, name="hrmcrnn")
    elif use_rmc:
        rnn = RMCRNN(64, 4, 4,
                     return_sequences=True, return_state=True, name="rmcrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(256, return_sequences=True,
                                             return_state=True, name="lstm")

    model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                  use_reward_prediction, after_rnn, use_pixel_control,
                  use_pixel_reconstruction, "agent", **phs)

    return model
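# For reference, a minimal sketch of how this builder might be invoked from an
# evaluation script. The concrete values below (single-frame input, 12 actions,
# the hierarchical-RMC state size) are illustrative assumptions taken from the
# evaluation run() further down, not values fixed by build_evaluator_model itself.
evaluator_kwargs = {
    "frames": 1,
    "act_space": 12,
    "state_size": 1 + 2 * (4 + 4) * 4 * 64,
    "use_rmc": False,
    "use_hrmc": True,
    "use_hrnn": False,
    "use_reward_prediction": False,
    "after_rnn": False,
    "use_pixel_control": False,
    "use_pixel_reconstruction": False,
}
evaluator = build_evaluator_model(evaluator_kwargs)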
def run():
    CKPT_DIR = "/".join(os.getcwd().split("/")[:-2]) + "/ckpt/ppo16"
    frames = 1
    action_repeats = [1]
    MAX_STEPS = 320000
    act_space = 12
    use_rmc = False
    use_hrmc = True
    use_reward_prediction = False
    use_pixel_control = False
    after_rnn = False

    sess = tf.Session()

    phs = dict()
    phs["s"] = tf.placeholder(dtype=tf.float32, shape=[None, None, 84, 84, frames])
    phs["prev_a"] = tf.placeholder(dtype=tf.int32, shape=[None, None])
    phs["prev_r"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    # phs["a"] = tf.placeholder(dtype=tf.int32, shape=[None, None])
    # phs["a_logits"] = tf.placeholder(dtype=tf.float32, shape=[None, None, act_space])
    # phs["adv"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    phs["v_cur"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    # phs["slots"] = tf.placeholder(dtype=tf.float32, shape=[None, None])

    if use_hrmc:
        state_size = 1 + 2 * (4 + 4) * 4 * 64
        phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        lstm = TmpHierRMCRNN(4, 64, 4, 4,
                             return_sequences=True, return_state=True, name="hrmcrnn")
    elif use_rmc:
        state_size = 64 * 4 * 4
        phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        lstm = RMCRNN(64, 4, 4,
                      return_sequences=True, return_state=True, name="rmcrnn")
    else:
        state_size = 256 * 2
        phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        lstm = tf.compat.v1.keras.layers.LSTM(256, return_sequences=True,
                                              return_state=True, name="lstm")

    model = Model(act_space, lstm, use_rmc, use_hrmc,
                  use_reward_prediction, after_rnn, use_pixel_control,
                  "agent", **phs)

    saver = tf.train.Saver(max_to_keep=None, keep_checkpoint_every_n_hours=6)
    ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
    saver.restore(
        sess, os.path.join(CKPT_DIR, ckpt.model_checkpoint_path.split("/")[-1]))

    envs = []
    games = ["SuperMarioBros-1-1-v0",
             "SuperMarioBros-2-1-v0",
             "SuperMarioBros-4-1-v0",
             "SuperMarioBros-5-1-v0"]
    # games = ["SuperMarioBros-2-3-v0",
    #          "SuperMarioBros-5-2-v0",
    #          "SuperMarioBros-7-1-v0",
    #          "SuperMarioBros-7-3-v0",
    #          "SuperMarioBros-8-1-v0",
    #          "SuperMarioBros-8-2-v0",
    #          "SuperMarioBros-8-3-v0"]
    # NOTE: the list below overrides the hard-coded levels above.
    games = ["SuperMarioBros-%d-%d-v0" % (i, j)
             for i in [6] for j in [1, 2, 3, 4]]

    for i in range(len(games)):
        env = Env(12, action_repeats, frames, state_size, games[i])
        envs.append(env)

    while True:
        for i in range(MAX_STEPS):
            # Batch the per-env observations, previous actions/rewards and RNN states.
            _s_t_batch = [env.get_state()[None, :, :, :] for env in envs]
            _a_t_batch = [[env.get_act()] for env in envs]
            _r_t_batch = [[env.r[-1]] for env in envs]
            _state_in_batch = [env.get_state_in() for env in envs]

            _a_t_new, _a_t_logits, _v_cur, _state_out_batch = sess.run(
                [model.get_current_act(),
                 model.get_current_act_logits(),
                 model.current_value,
                 model.state_out],
                feed_dict={model.s_t: _s_t_batch,
                           model.previous_actions: _a_t_batch,
                           model.prev_r: _r_t_batch,
                           model.state_in: _state_in_batch})

            # _a_t_new = np.argmax(_a_t_logits, axis=-1)

            [env.step(_a_t_new[i][0], _a_t_logits[i][0], _state_out_batch[i])
             for (i, env) in enumerate(envs)]
            [env.update_v(_v_cur[i][0]) for (i, env) in enumerate(envs)]

            force = False
            if i == MAX_STEPS - 1:
                force = True
            [env.reset(force) for env in envs]
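# The 1 + 2 * (4 + 4) * 4 * 64 expression above flattens the hierarchical RMC
# recurrent state into a single vector of 4097 floats per environment. A quick
# sanity check of the arithmetic; the naming of the factors is an assumption,
# only the numbers themselves come from the code.
n_slots = 4 + 4    # assumed: memory slots across the two hierarchy levels
n_heads = 4        # assumed: attention heads
head_size = 64     # assumed: units per head
hrmc_state_size = 1 + 2 * n_slots * n_heads * head_size
assert hrmc_state_size == 4097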
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    use_rmc = FLAGS.use_rmc
    use_hrmc = FLAGS.use_hrmc
    use_hrnn = FLAGS.use_hrnn
    use_icm = FLAGS.use_icm
    use_coex = FLAGS.use_coex
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    use_pixel_reconstruction = FLAGS.use_pixel_reconstruction
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    # Polynomial decay towards init_lr / 10, blended with a linear warmup
    # over the first `warmup_steps` steps.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(4, 64, 4, 2, 8, 'lstm', 'rmc',
                         return_sequences=True, return_state=True, name="hrnn")
    elif use_hrmc:
        rnn = TmpHierRMCRNN(4, 64, 4, 4,
                            return_sequences=True, return_state=True, name="hrmcrnn")
    elif use_rmc:
        rnn = RMCRNN(64, 4, 4,
                     return_sequences=True, return_state=True, name="rmcrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(256, return_sequences=True,
                                             return_state=True, name="lstm")

    # Burn-in model: runs only on the `pre` segment to produce the recurrent
    # state that initializes the trained (`post`) segment.
    pre_model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                      use_reward_prediction, after_rnn, use_pixel_control,
                      use_pixel_reconstruction, "agent", **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                       use_reward_prediction, after_rnn, use_pixel_control,
                       use_pixel_reconstruction, "agent", **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    losses = dPPOcC(act=post_model.a_t,
                    policy_logits=post_model.current_act_logits,
                    old_policy_logits=post_model.old_act_logits,
                    advantage=post_model.advantage,
                    policy_clip=FLAGS.ppo_clip,
                    vf=post_model.current_value,
                    vf_target=post_model.ret,
                    value_clip=FLAGS.vf_clip,
                    old_vf=post_model.old_current_value)

    # `slots` masks out padded timesteps in every loss term.
    entropy_loss = tf.reduce_mean(
        entropy(post_model.current_act_logits) * post_model.slots)
    p_loss = tf.reduce_mean(losses.p_loss * post_model.slots)
    v_loss = tf.reduce_mean(losses.v_loss * post_model.slots)

    add_loss = 0.0
    if use_icm:
        icmloss = icm(post_model.cnn_feature[:, :-1, :],
                      post_model.cnn_feature[:, 1:, :],
                      post_model.a_t[:, :-1], act_space)
        add_loss += 0.2 * tf.reduce_mean(
            icmloss.f_loss * post_model.slots[:, :-1]) + 0.8 * tf.reduce_mean(
            icmloss.i_loss * post_model.slots[:, :-1])
    if use_coex:
        coexloss = coex(post_model.image_feature[:, :-1, :, :, :],
                        post_model.image_feature[:, 1:, :, :, :],
                        post_model.a_t[:, :-1], act_space)
        add_loss += tf.reduce_mean(coexloss * post_model.slots[:, :-1])
    if use_hrmc or use_hrnn:
        # KL between posterior q and prior p of the latent variables.
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div", pq_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss

        # KL between the prior p and a narrow zero-mean Gaussian.
        p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas,
                                      tf.zeros_like(post_model.p_mus),
                                      0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += p_kl_coef * p_kl_loss
    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r_t) * post_model.slots)
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss
    if use_pixel_control:
        # Mean pixel change between consecutive frames, pooled down to the
        # spatial resolution of the pixel-control head.
        change_of_cells = tf.reduce_mean(
            post_model.s_t[:, 1:, :, :, :] - post_model.s_t[:, :-1, :, :, :],
            axis=-1)
        s_shape = get_shape(change_of_cells)
        s_H, s_W = s_shape[2:]
        ctr_H, ctr_W = get_shape(post_model.pixel_control)[2:4]
        change_of_cells = tf.reduce_mean(
            tf.reshape(change_of_cells,
                       shape=s_shape[:2] + [ctr_H, s_H // ctr_H,
                                            ctr_W, s_W // ctr_W]),
            axis=(3, 5))

        ctr = tf.reduce_sum(
            tf.transpose(post_model.pixel_control, perm=(0, 1, 4, 2, 3))
            * tf.one_hot(post_model.a_t,
                         depth=post_model.act_space,
                         dtype=tf.float32)[:, :, :, None, None],
            axis=2)[:, :-1, :, :]
        ctr_loss = tf.reduce_mean(mse(ctr, change_of_cells))
        tf.summary.scalar("pixel_control_loss", ctr_loss)
        add_loss += ctr_loss
    if use_pixel_reconstruction:
        rec_loss = tf.reduce_mean(
            mse(post_model.pixel_reconstruction, post_model.s_t)
            * post_model.slots[:, :, None, None, None])
        tf.summary.scalar("rec_loss", rec_loss)
        add_loss += rec_loss

    loss = (FLAGS.pi_coef * p_loss
            + FLAGS.vf_coef * v_loss
            - ent_coef * entropy_loss
            + add_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies([train_op]):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("ent_coef", ent_coef)
    tf.summary.scalar("ent_loss", entropy_loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train
def run(**kwargs):
    tmplimit = 512
    lifelong = None

    server_id = kwargs.get("server_id", 0)
    address = "ipc:///tmp/databack%d" % server_id

    SCRIPT_DIR = kwargs.get("SCRIPT_DIR")
    BASE_DIR = kwargs.get("BASE_DIR")
    CKPT_DIR = kwargs.get("CKPT_DIR")
    DATA_DIR = kwargs.get("DATA_DIR")

    logging.basicConfig(filename=os.path.join(BASE_DIR, "Serverlog"), level="INFO")

    frames = kwargs.get("frames", 1)
    workers = kwargs.get("workers", 16)
    parallel = kwargs.get("worker_parallel", 4)
    MAX_STEPS = kwargs.get("max_steps", 3200)
    seqlen = kwargs.get("seqlen", 32)
    burn_in = kwargs.get("burn_in", 32)
    act_space = kwargs.get("act_space", 7)
    use_rmc = kwargs.get("use_rmc", 0)
    use_hrmc = kwargs.get("use_hrmc", 0)
    use_reward_prediction = kwargs.get("use_reward_prediction", 0)
    use_pixel_control = kwargs.get("use_pixel_control", 0)

    games = ["SuperMarioBros-%d-%d-v0" % (i, j)
             for i in range(1, 9) for j in range(1, 5)]

    sess = tf.Session()

    phs = dict()
    phs["s"] = tf.placeholder(dtype=tf.float32, shape=[None, None, 84, 84, frames])
    phs["prev_a"] = tf.placeholder(dtype=tf.int32, shape=[None, None])
    phs["a"] = tf.placeholder(dtype=tf.int32, shape=[None, None])
    phs["a_logits"] = tf.placeholder(dtype=tf.float32, shape=[None, None, act_space])
    phs["r"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    phs["prev_r"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    phs["adv"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    phs["v_cur"] = tf.placeholder(dtype=tf.float32, shape=[None, None])
    phs["slots"] = tf.placeholder(dtype=tf.float32, shape=[None, None])

    if use_hrmc:
        state_size = 1 + 2 * (4 + 4) * 4 * 64
        phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        lstm = TmpHierRMCRNN(4, 64, 4, 4,
                             return_sequences=True, return_state=True, name="hrmcrnn")
    elif use_rmc:
        state_size = 64 * 4 * 4
        phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        lstm = RMCRNN(64, 4, 4,
                      return_sequences=True, return_state=True, name="rmcrnn")
    else:
        state_size = 256 * 2
        phs["state_in"] = tf.placeholder(dtype=tf.float32, shape=[None, state_size])
        lstm = tf.compat.v1.keras.layers.LSTM(256, return_sequences=True,
                                              return_state=True, name="lstm")

    model = Model(act_space, lstm, use_rmc, use_hrmc,
                  use_reward_prediction, use_pixel_control, "agent", **phs)

    saver = tf.train.Saver(max_to_keep=None, keep_checkpoint_every_n_hours=6)

    ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
    ckpt_path = None
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    # The inference server receives rollout requests from workers over ZeroMQ
    # and replies through per-worker multiprocessing queues.
    context = zmq.Context()
    frontend = context.socket(zmq.ROUTER)
    frontend.bind(address)

    queue_ins = OrderedDict()
    # queue_out = Queue(maxsize=3 * tmplimit)
    for i in range(workers):
        queue_in = Queue()
        worker_id = i
        queue_ins[worker_id] = queue_in
        worker = Process(target=Worker_Q,
                         args=(queue_in, address, parallel, BASE_DIR, DATA_DIR,
                               3 * tmplimit, server_id, worker_id,
                               "\t".join(games), frames, seqlen, burn_in,
                               act_space, state_size))
        worker.daemon = True
        worker.start()

    while True:
        # Reload the newest checkpoint whenever the learner has written one.
        ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
        if ckpt is not None:
            new_ckpt_path = ckpt.model_checkpoint_path
            if new_ckpt_path != ckpt_path:
                ckpt_path = new_ckpt_path
                saver.restore(sess, ckpt_path)

        fd = {model.s_t: [],
              model.previous_actions: [],
              model.state_in: []}

        idx, msg = frontend.recv_multipart(copy=False)
        worker_id, databack = unpack(msg)
        s, a, r, state_in = databack
        fd[model.s_t] = s
        fd[model.previous_actions] = a
        fd[phs["prev_r"]] = r
        fd[model.state_in] = state_in

        _a_t_new, _a_t_logits, _v_cur, _state_out_batch = sess.run(
            [model.get_current_act(),
             model.get_current_act_logits(),
             model.current_value,
             model.state_out],
            feed_dict=fd)

        dataforward = (_a_t_new, _a_t_logits, _state_out_batch, _v_cur)
        queue_ins[worker_id].put(dataforward)
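# For orientation, a minimal sketch of what the worker side of this exchange
# might look like. This is NOT the actual Worker_Q implementation: the function
# name is hypothetical, and pickle stands in for the repo's pack/unpack helpers,
# which are assumed to be symmetric serializers. A real worker would create the
# socket once and reuse it across steps rather than per call.
import pickle
import zmq

def request_inference(queue_in, address, worker_id, s, a, r, state_in):
    """Send one observation batch to the inference server and wait for the
    result that the server pushes into this worker's multiprocessing Queue."""
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.DEALER)
    sock.connect(address)  # e.g. "ipc:///tmp/databack0"

    # A single-frame DEALER send arrives at the ROUTER as [identity, frame],
    # which matches the frontend.recv_multipart() call in the server loop.
    sock.send(pickle.dumps((worker_id, (s, a, r, state_in))))

    # The reply travels out-of-band through the per-worker Queue, in the order
    # the server packs it: (actions, logits, state_out, values).
    a_new, a_logits, state_out, v_cur = queue_in.get()
    return a_new, a_logits, state_out, v_cur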