def __init__(self, arglist):
    self.env_name = arglist.game
    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    self.vae.load_json(arglist.vae_file)
    self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)
    self.rnn.load_json(arglist.rnn_file)
    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    self.input_size = rnn_output_size(EXP_MODE)
    self.z_size = 32

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        self.hidden_size = 40
        self.weight_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.random.randn(self.hidden_size)
        self.weight_output = np.random.randn(self.hidden_size, 2)
        self.bias_output = np.random.randn(2)
        self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 2 + 2)
    else:
        self.weight = np.random.randn(self.input_size, 2)
        self.bias = np.random.randn(2)
        self.param_count = (self.input_size) * 2 + 2

    self.render_mode = False
def __init__(self, load_model=True):
    self.env_name = './VisualPushBlock_withBlock_z_info.x86_64'  # './VisualPushBlock.x86_64'
    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)
    if load_model:
        self.vae.load_json('vae/vae.json')
        self.rnn.load_json('rnn/rnn.json')
    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    self.input_size = rnn_output_size(EXP_MODE)
    self.z_size = z_size

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer  ### CHANGE is made here
        self.hidden_size = 40
        self.weight_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.random.randn(self.hidden_size)
        self.weight_output = np.random.randn(self.hidden_size, ACTION_SIZE)
        self.bias_output = np.random.randn(ACTION_SIZE)
        self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * ACTION_SIZE + ACTION_SIZE)
    else:
        self.weight = np.random.randn(self.input_size, ACTION_SIZE)
        self.bias = np.random.randn(ACTION_SIZE)
        self.param_count = (self.input_size) * ACTION_SIZE + ACTION_SIZE

    self.render_mode = False
def __init__(self):
    self.env_name = "carracing"
    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    self.vae.load_json('vae/vae.json')
    self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)
    self.rnn.load_json('rnn/rnn.json')
    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    self.input_size = rnn_output_size(EXP_MODE)
    self.z_size = 32

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        self.hidden_size = 40
        self.weight_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.random.randn(self.hidden_size)
        self.weight_output = np.random.randn(self.hidden_size, 3)
        self.bias_output = np.random.randn(3)
        self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 3 + 3)
    else:
        self.weight = np.random.randn(self.input_size, 3)
        self.bias = np.random.randn(3)
        self.param_count = (self.input_size) * 3 + 3

    self.render_mode = False
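# The controller constructors above only allocate the weight/bias buffers and record
# `param_count`; the flat parameter vector found by the evolution strategy still has to be
# unpacked into those buffers before an action can be computed. The sketch below shows one
# common way to do this for the 3-action CarRacing controller. It is an assumption, not code
# from the snippets above: the method names (`set_model_params`, `get_action`), the bias-first
# packing order, and the gas/brake squashing are all hypothetical choices consistent with the
# shapes and `param_count` arithmetic used here.
def set_model_params(self, model_params):
    params = np.array(model_params)
    if EXP_MODE == MODE_Z_HIDDEN:
        cut_off = (self.input_size + 1) * self.hidden_size
        params_1, params_2 = params[:cut_off], params[cut_off:]
        self.bias_hidden = params_1[:self.hidden_size]
        self.weight_hidden = params_1[self.hidden_size:].reshape(self.input_size, self.hidden_size)
        self.bias_output = params_2[:3]
        self.weight_output = params_2[3:].reshape(self.hidden_size, 3)
    else:
        self.bias = params[:3]
        self.weight = params[3:].reshape(self.input_size, 3)

def get_action(self, h):
    # h is the controller input, e.g. the [z, hidden] feature returned by rnn_output(...)
    if EXP_MODE == MODE_Z_HIDDEN:
        x = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(x, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    # CarRacing-style squashing: steering stays in [-1, 1], gas and brake map to [0, 1]
    action[1] = (action[1] + 1.0) / 2.0
    action[2] = np.clip(action[2], 0.0, 1.0)
    return action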
def __init__(self, load_model=True):
    # For Mac
    # self.env_name = "/Users/intuinno/codegit/pushBlock/app/mac/VisualPushBlockContinuous"
    # For linux
    self.env_name = "/home/intuinno/codegit/pushblock/app/linux/pushblock.x86_64"
    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)
    if load_model:
        self.vae.load_json('vae/vae.json')
        self.rnn.load_json('rnn/rnn.json')
    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    self.input_size = rnn_output_size(EXP_MODE)
    self.z_size = 32

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        self.hidden_size = 40
        self.weight_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.random.randn(self.hidden_size)
        self.weight_output = np.random.randn(self.hidden_size, 3)
        self.bias_output = np.random.randn(3)
        self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 3 + 3)
    else:
        self.weight = np.random.randn(self.input_size, 3)
        self.bias = np.random.randn(3)
        self.param_count = (self.input_size) * 3 + 3

    self.render_mode = False
def __init__(self, args, load_model=True, full_episode=False, with_obs=False):
    super(CarRacingMDNRNN, self).__init__(full_episode=full_episode)
    self.with_obs = with_obs  # whether or not to return the frame with the encodings
    self.vae = CVAE(args)
    self.rnn = MDNRNN(args)
    if load_model:
        self.vae.set_weights(
            tf.keras.models.load_model(
                'results/{}/{}/tf_vae'.format(args.exp_name, args.env_name),
                compile=False).get_weights())
        self.rnn.set_weights(
            tf.keras.models.load_model(
                'results/{}/{}/tf_rnn'.format(args.exp_name, args.env_name),
                compile=False).get_weights())
    self.rnn_states = rnn_init_state(self.rnn)
    self.full_episode = False
    self.observation_space = Box(low=np.NINF, high=np.Inf, shape=(32 + 256,))
def __init__(self, args, load_model=True, full_episode=False, with_obs=False):
    super(CarRacingMDNRNN, self).__init__(full_episode=full_episode)
    self.with_obs = with_obs  # whether or not to return the frame with the encodings
    self.vae = CVAE(args)
    self.rnn = MDNRNN(args)
    if load_model:
        self.vae.set_weights([param_i.numpy() for param_i in
                              tf.saved_model.load('results/{}/tf_vae'.format(args.env_name)).variables])
        self.rnn.set_weights([param_i.numpy() for param_i in
                              tf.saved_model.load('results/{}/tf_rnn'.format(args.env_name)).variables])
    self.rnn_states = rnn_init_state(self.rnn)
    self.full_episode = False
    self.observation_space = Box(low=np.NINF, high=np.Inf,
                                 shape=(args.z_size + args.rnn_size * args.state_space,))
def reset(self, test=False):
    self.state_rnn = rnn_init_state(self.rnn)
    if self.seed:
        self.env.seed(random.choice(self.seed))
    self.flip_episode = random.random() > 0.5 and not test and self.flip
    state, self.state_rnn = self.encode_obs(self.env.reset(), self.state_rnn,
                                            np.array([0.5, 0.2, 0.8]))
    return state, 1
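# Hypothetical sketch of the `encode_obs` helper called above: encode the raw frame with the
# VAE, advance the MDN-RNN state with the previous action, and return the concatenated
# controller input together with the new recurrent state. The helper names `_process_frame`
# and `rnn_next_state`, the 64x64x3 frame size, and the LSTM state layout are assumptions,
# not taken from the snippet above.
def encode_obs(self, obs, state_rnn, prev_action):
    small_frame = self._process_frame(obs)                     # e.g. crop/resize to 64x64x3 in [0, 1]
    z = self.vae.encode(small_frame.reshape(1, 64, 64, 3)).flatten()
    state_rnn = rnn_next_state(self.rnn, z, prev_action, state_rnn)
    h = np.asarray(state_rnn.h).flatten()                      # hidden half of the LSTM state
    return np.concatenate([z, h]), state_rnn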
def __init__(self, arglist, action_space, scope, load_model=True):
    self.action_space = action_space
    self.arglist = arglist
    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    hps_sample = hps_model._replace(
        batch_size=1,
        input_seq_width=32 + arglist.action_space + (arglist.agent_num - 1) * arglist.action_space * arglist.timestep,
        max_seq_len=1,
        use_recurrent_dropout=0,
        is_training=0)
    self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)
    if load_model:
        self.vae.load_json(arglist.vae_model_dir)
        self.rnn.load_json(arglist.rnn_model_dir)
    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    if arglist.inference:
        self.input_size = rnn_output_size(EXP_MODE) + (arglist.agent_num - 1) * arglist.action_space
    else:
        self.input_size = rnn_output_size(EXP_MODE)
    self.z_size = 32

    # action trajectories recording
    self.act_traj = [
        collections.deque(np.zeros((arglist.timestep, arglist.action_space)),
                          maxlen=arglist.timestep)
    ] * (arglist.agent_num - 1)
    self.oppo_model = Oppo_Model(arglist.agent_num, arglist.timestep,
                                 arglist.action_space, arglist.action_space,
                                 "oppo_model_{}".format(scope))
    self.inference = arglist.inference

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        self.hidden_size = 40
        self.weight_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.random.randn(self.hidden_size)
        self.weight_output = np.random.randn(self.hidden_size, self.action_space)
        self.bias_output = np.random.randn(self.action_space)
        self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * self.action_space + self.action_space)
    else:
        self.weight = np.random.randn(self.input_size, self.action_space)
        self.bias = np.random.randn(self.action_space)
        self.param_count = (self.input_size) * self.action_space + self.action_space
def reset(self):
    self.rnn_states = rnn_init_state(self.rnn)
    if self.with_obs:
        [z_state, obs] = super(CarRacingMDNRNN, self).reset()  # calls step
        self.N_tiles = len(self.track)
        return [z_state, obs]
    else:
        z_state = super(CarRacingMDNRNN, self).reset()  # calls step
        self.N_tiles = len(self.track)
        return z_state
def _reset(self):
    obs = super(DoomTakeCoverMDNRNN, self)._reset()
    small_obs = self._process_frame(obs)
    self.current_obs = small_obs
    self.rnn_states = rnn_init_state(self.rnn)
    self.z = self._encode(small_obs)
    self.restart = 1
    self.frame_count = 0

    if self.with_obs:
        return [self._current_state(), self.current_obs]
    else:
        return self._current_state()
def reset(self):
    self.rnn_states = rnn_init_state(self.rnn)
    obs = super(CarRacingMDNRNN, self).reset()
    obs = self._process_frame(obs)
    z = self.encode_obs(obs)
    h = tf.squeeze(self.rnn_states[0])
    z_h = tf.concat([z, h], axis=-1)
    if self.with_obs:
        return [z_h, obs]
    else:
        z_h = super(CarRacingMDNRNN, self).reset()  # calls step
        return z_h
def __init__(self, load_model=True, full_episode=False):
    super(CarRacingMDNRNN, self).__init__(full_episode=full_episode)
    self.vae = CVAE(batch_size=1)
    self.rnn = MDNRNN(hps_sample)
    if load_model:
        self.vae.load_json('tf_vae/vae.json')
        self.rnn.load_json('tf_rnn/rnn.json')
    self.rnn_states = rnn_init_state(self.rnn)
    self.full_episode = False
    self.observation_space = Box(low=np.NINF, high=np.Inf, shape=(32 + 256,))
def reset(self, **kwargs):
    # TODO: Is this zeroes? Can this just be None?
    self.rnn_state = rnn_init_state(self.rnn)  # [h, c]
    # reset() of wrapped environment might call step() and already set self.z in the process
    self.z = None
    obs = self.env.reset(**kwargs)
    # If step() was not called, calculate z from initial observation
    if self.z is None:
        self.z = self.encode_image_to_z(obs)
    obs = self.modify_observation(obs)
    return obs
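# A possible shape for the `modify_observation` helper referenced above (hypothetical, not from
# the original source): replace the raw frame with the concatenation of the VAE latent and the
# RNN hidden state, matching the [z, h] convention used by the other wrappers in this listing.
def modify_observation(self, obs):
    h = np.asarray(self.rnn_state[0]).flatten()   # hidden half of the [h, c] state
    return np.concatenate([self.z.flatten(), h], axis=-1)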
def __init__(self, index=0, rl_training=False, restore_model=False, global_buffer=None,
             net=None, strategy_agent=None, greedy_action=False, extract_save_dir=None,
             load_model=True, ntype='worldmodel'):
    super(MiniSourceAgent, self).__init__()
    self.net = net
    self.index = index
    self.global_buffer = global_buffer
    self.restore_model = restore_model

    # model in brain
    self.strategy_agent = strategy_agent
    self.strategy_act = None

    # count num
    self.step = 0

    self.strategy_wait_secs = 2
    self.strategy_flag = False
    self.policy_wait_secs = 2
    self.policy_flag = True

    self.env = None
    self.obs = None

    # buffer
    self.local_buffer = Buffer()

    self.num_players = 2
    self.on_select = None
    self._result = None
    self._gases = None
    self.is_end = False

    self.greedy_action = greedy_action
    self.rl_training = rl_training
    self.extract_save_dir = extract_save_dir

    self.rnn_state = rnn_init_state(self.net.rnn)
def reset(self):
    super(MiniSourceAgent, self).reset()
    self.step = 0
    self.obs = None
    self._result = None
    self._gases = None
    self.is_end = False
    self.strategy_flag = False
    self.policy_flag = True
    self.local_buffer.reset()
    if self.strategy_agent is not None:
        self.strategy_agent.reset()
    self.rnn_state = rnn_init_state(self.net.rnn)
def __init__(self, model_name='', load_model=True, load_full_model=False, full_model_path=''):
    self.model_name = model_name
    self.env_name = "carracing"
    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)

    if load_full_model:
        self.vae.load_json(os.path.join(full_model_path, 'vae.json'))
        self.rnn.load_json(os.path.join(full_model_path, 'rnn.json'))
    elif load_model:
        self.vae.load_json(os.path.join(vae_path, self.model_name + '_vae.json'))
        self.rnn.load_json(os.path.join(rnn_path, self.model_name + '_rnn.json'))

    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    self.input_size = rnn_output_size(EXP_MODE)
    self.z_size = 32

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        self.hidden_size = 40
        self.weight_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.random.randn(self.hidden_size)
        self.weight_output = np.random.randn(self.hidden_size, 3)
        self.bias_output = np.random.randn(3)
        self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 3 + 3)
    else:
        self.weight = np.random.randn(self.input_size, 3)
        self.bias = np.random.randn(3)
        self.param_count = (self.input_size) * 3 + 3

    self.render_mode = False
def __init__(self, load_model=True): self.env_name = "Pong" self._make_env() self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) hps_sample_dynamic = hps_sample._replace(num_actions=self.num_actions) self.rnn = MDNRNN(hps_sample_dynamic, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer raise Exception("not ported for atari") self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, self.num_actions) self.bias_output = np.random.randn(self.num_actions) self.param_count = ((self.input_size + 1) * self.hidden_size) + ( (self.hidden_size + 1) * self.num_actions) else: # TODO: Not known until env.action_space is queried... self.weight = np.random.randn(self.input_size, self.num_actions) self.bias = np.random.randn(self.num_actions) self.param_count = (self.input_size + 1) * self.num_actions self.render_mode = False
def __init__(self, load_model=True, env_name="Pong-v0", render_mode=False):
    self.env_name = env_name
    self.make_env()
    self.z_size = 32

    self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
    hps_atari = hps_sample._replace(input_seq_width=self.z_size + self.na)
    self.rnn = MDNRNN(hps_atari, gpu_mode=False, reuse=True)

    if load_model:
        self.vae.load_json('vae/vae.json')
        self.rnn.load_json('rnn/rnn.json')

    self.state = rnn_init_state(self.rnn)
    self.rnn_mode = True

    self.input_size = rnn_output_size(EXP_MODE)
    self.init_controller()
    self.render_mode = False
def reset(self):
    self.rnn_state = rnn_init_state(self.rnn)
    self.z = self._sample_init_z()
    obs = OrderedDict(features=rnn_output(self.rnn_state, self.z, self.features_mode))
    return obs
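# Hypothetical sketch of the `_sample_init_z` helper used above. Dream-environment style
# resets typically draw the initial latent from stored statistics of first-frame encodings;
# the attribute names `initial_mu` and `initial_logvar` are assumptions for illustration.
def _sample_init_z(self):
    idx = np.random.randint(0, len(self.initial_mu))            # pick a recorded first frame
    mu, logvar = np.asarray(self.initial_mu[idx]), np.asarray(self.initial_logvar[idx])
    return mu + np.exp(logvar / 2.0) * np.random.randn(*mu.shape)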
def reset(self):
    self.rnn_states = rnn_init_state(self.rnn)
    z_h = super(CarRacingWrapper, self).reset()  # calls step
    return z_h
def reset(self):
    self.rnn_states = rnn_init_state(self.rnn)
    z = np.expand_dims(self._sample_init_z(), axis=0)
    self.o = z
    z_ch = tf.concat([z, self.rnn_states[1], self.rnn_states[0]], axis=-1)
    return tf.squeeze(z_ch)
def train(sess, env, actor, critic, global_step):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # load model if have
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("./results")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
        print("global step: ", global_step.eval())
    else:
        print("Could not find old network weights")

    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    state = rnn_init_state(rnn)

    i = global_step.eval()
    eps = 1
    lr = INITIAL_LR

    while True:
        i += 1
        s = env.reset()
        s, state = encode_obs(s, state, np.array([0.5, 0.2, 0.8]))
        # s = prepro(s)
        ep_reward = 0
        ep_ave_max_q = 0

        eps *= EPS_DECAY_RATE
        lr *= LR_DECAY_RATE
        lr = np.max([lr, MINI_LR])  # minimum of learning rate is MINI_LR

        if i % SAVE_STEP == 0:  # save check point every 1000 episode
            sess.run(global_step.assign(i))
            save_path = saver.save(sess, "./results/model.ckpt", global_step=global_step)
            print("Model saved in file: %s" % save_path)
            print("Successfully saved global step: ", global_step.eval())

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            # print(s.shape)
            a = actor.predict(np.reshape(s, (-1, 16, 10, 2)))
            print(a)
            # action = a[0] + 1./(1+i+j)  # add noise for exploration
            noise = np.random.normal(0, 0.2 * eps, 3)
            noise[1] = np.random.normal(0.4, 0.1 * eps)
            action = a[0] + noise
            s2, r, terminal, info = env.step(action)
            s2, state = encode_obs(s2, state, action)
            # s2 = prepro(s2)
            action = np.expand_dims(action, axis=0)
            # plt.imshow(s2)
            # plt.show()
            # if r > 0:
            #     r = 1
            # elif r < 0:
            #     r = -1
            # print 'r: ', r
            # replay_buffer.add(np.reshape(s, (96, 96, 3)), np.reshape(action, (actor.a_dim,)), r,
            #                   terminal, np.reshape(s2, (96, 96, 3)), lr)
            replay_buffer.add(s, np.reshape(action, (actor.a_dim,)), r, terminal, s2, lr)

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch, lr_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                s2_batch = np.reshape(s2_batch, (64, 16, 10, 2))
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                s_batch = np.reshape(s_batch, (64, 16, 10, 2))
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)), lr_batch)

                ep_ave_max_q += np.amax(predicted_q_value)
                # print ep_ave_max_q

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                # print grads[0]
                actor.train(s_batch, grads[0], lr_batch)

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()
                print('| Reward : %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

            s = s2
            ep_reward += r

            if terminal:
                # summary_str = sess.run(summary_ops, feed_dict={
                #     summary_vars[0]: ep_reward,
                #     summary_vars[1]: ep_ave_max_q / float(j)
                # })
                # writer.add_summary(summary_str, i)
                # writer.flush()
                # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \
                #     '| Qmax: %.4f' % (ep_ave_max_q / float(j))
                break
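# A minimal sketch of the `build_summaries` helper assumed by train() above (TF1-style).
# It produces the merged summary op and the two variables fed through feed_dict with the
# episode reward and the average max Q value; the scalar names are assumptions.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars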
def reset(self):
    self.state = rnn_init_state(self.rnn)
vae.load_json(os.path.join('vae', 'vae.json'))

# Fourth, build the RNN
hps_atari_sample = hps_sample._replace(input_seq_width=z_size + na)
OUTWIDTH = hps_atari_sample.output_seq_width
rnn = MDNRNN(hps_atari_sample, gpu_mode=False)
rnn.load_json(os.path.join('rnn', 'rnn.json'))
print("All model loaded.")

# Fifth, run the evaluation. -> We have no predictions about the first frame.
start = time.time()
state = rnn_init_state(rnn)  # initialize the state.
pz = None

for i in range(steps):
    ob = obs[i:i+1]              # (1, 64, 64, 1)
    action = oh_actions[i:i+1]   # (1, n)
    z = vae.encode(ob)           # (1, 32)  VAE done!

    rnn_z = np.expand_dims(z, axis=0)                    # (1, 1, 32)
    action = np.expand_dims(action, axis=0)              # (1, 1, n)
    input_x = np.concatenate([rnn_z, action], axis=2)    # (1, 1, 32+n)
    feed = {rnn.input_x: input_x, rnn.initial_state: state}
    # predict the next state and next z.
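    # Hypothetical continuation of the evaluation step above (the original snippet is cut off
    # here): run the MDN head and the recurrent state update in one session call, then sample
    # the predicted next z. The attribute names out_logmix, out_mean, out_logstd, and
    # final_state follow the common MDN-RNN implementation this code appears to use, but are
    # assumptions rather than part of the source.
    (logmix, mean, logstd, next_state) = sess.run(
        [rnn.out_logmix, rnn.out_mean, rnn.out_logstd, rnn.final_state], feed)

    # turn log mixture weights into probabilities, then sample one Gaussian per z-dimension
    w = np.exp(logmix - logmix.max(axis=1, keepdims=True))
    w /= w.sum(axis=1, keepdims=True)
    pz = np.zeros(OUTWIDTH)
    for d in range(OUTWIDTH):
        k = np.random.choice(w.shape[1], p=w[d])
        pz[d] = mean[d, k] + np.exp(logstd[d, k]) * np.random.randn()

    state = next_state  # carry the recurrent state forward to the next frame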