class ModelMCTS(Model): def __init__(self, load_model=True): self.env_name = "carracing" self.env = make_env(self.env_name, seed=SEED, render_mode=render_mode, full_episode=False) self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('../vae/vae.json') self.rnn.load_json('../rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 3) self.bias_output = np.random.randn(3) self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 3 + 3) else: self.weight = np.random.randn(self.input_size, 3) self.bias = np.random.randn(3) self.param_count = (self.input_size) * 3 + 3 self.render_mode = False self.mct = None def get_action(self, z): a = random_linear_sample(-1, 1) b = random_linear_sample(0, 1) c = random_linear_sample(0, 1) actions = dp(a, b, c) action, self.mct = mcts.mcts(z, self.env, actions, old_tree=self.mct, tree_depth=6, simulate_depth=200) self.state = rnn_next_state(self.rnn, z, action, self.state) return action
class CarRacingMDNRNN(CarRacingWrapper):
    def __init__(self, load_model=True, full_episode=False):
        super(CarRacingMDNRNN, self).__init__(full_episode=full_episode)
        self.vae = CVAE(batch_size=1)
        self.rnn = MDNRNN(hps_sample)
        if load_model:
            self.vae.load_json('tf_vae/vae.json')
            self.rnn.load_json('tf_rnn/rnn.json')
        self.rnn_states = rnn_init_state(self.rnn)
        self.full_episode = False
        self.observation_space = Box(low=np.NINF, high=np.Inf, shape=(32 + 256,))

    def encode_obs(self, obs):
        # convert raw obs to z, mu, logvar
        result = np.copy(obs).astype(np.float) / 255.0
        result = result.reshape(1, 64, 64, 3)
        mu, logvar = self.vae.encode_mu_logvar(result)
        mu = mu[0]
        logvar = logvar[0]
        s = logvar.shape
        z = mu + np.exp(logvar / 2.0) * np.random.randn(*s)
        return z, mu, logvar

    def reset(self):
        self.rnn_states = rnn_init_state(self.rnn)
        z_h = super(CarRacingWrapper, self).reset()  # calls step
        return z_h

    def _step(self, action):
        obs, reward, done, _ = super(CarRacingMDNRNN, self)._step(action)
        z, _, _ = self.encode_obs(obs)
        h = tf.squeeze(self.rnn_states[0])
        z_h = tf.concat([z, h], axis=-1)
        if action is not None:  # don't compute state on reset
            self.rnn_states = rnn_next_state(self.rnn, z, action, self.rnn_states)
        return z_h, reward, done, {}
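The encode_obs method above draws z with the usual VAE reparameterization trick. A minimal standalone sketch of that sampling step, using only numpy; the 32-dim latent size follows the snippet, while the placeholder mu/logvar values are assumptions.

import numpy as np

def sample_z(mu, logvar, rng=np.random):
    # z = mu + sigma * eps, with sigma = exp(logvar / 2) and eps ~ N(0, I)
    eps = rng.randn(*logvar.shape)
    return mu + np.exp(logvar / 2.0) * eps

mu = np.zeros(32)              # hypothetical encoder outputs
logvar = np.full(32, -2.0)
z = sample_z(mu, logvar)
print(z.shape)                 # (32,)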
def sample_vae2(args): """ For vae from https://github.com/hardmaru/WorldModelsExperiments.git """ z_size = 32 batch_size = args.count learning_rate = 0.0001 kl_tolerance = 0.5 model_path_name = "tf_vae" reset_graph() vae = ConvVAE( z_size=z_size, batch_size=batch_size, learning_rate=learning_rate, kl_tolerance=kl_tolerance, is_training=False, reuse=False, gpu_mode=False) # use GPU on batchsize of 1000 -> much faster vae.load_json(os.path.join(model_path_name, 'vae.json')) z = np.random.normal(size=(args.count, z_size)) samples = vae.decode(z) input_dim = samples.shape[1:] n = args.count plt.figure(figsize=(20, 4)) plt.title('VAE samples') for i in range(n): ax = plt.subplot(2, n, i + 1) plt.imshow(samples[i].reshape(input_dim[0], input_dim[1], input_dim[2])) ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) #plt.savefig( image_path ) plt.show()
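sample_vae2 only relies on args.count and on tf_vae/vae.json being on disk. A hypothetical driver is sketched below; the flag name and default are assumptions, not taken from the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="decode random z vectors with a trained ConvVAE")
    parser.add_argument("--count", type=int, default=10, help="number of samples to plot")
    args = parser.parse_args()
    sample_vae2(args)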
class CarRacing: # Parameters # - type: Name of environment. Default is classic Car Racing game, but can be changed to introduce perturbations in environment # - history_pick: Size of history # - seed: List of seeds to sample from during training. Default is none (random games) def __init__(self, type="CarRacing", history_pick=4, seed=None, detect_edges=False, detect_grass=False, flip=False): self.name = type + str(time.time()) random.seed(30) self.env = make_env('CarRacing-v0', random.randint(1,10000000), render_mode = False, full_episode = True) self.image_dimension = [64,64] self.history_pick = history_pick self.state_space_size = history_pick * np.prod(self.image_dimension) self.action_space_size = 5 self.state_shape = [None, self.history_pick] + list(self.image_dimension) self.history = [] self.action_dict = {0: [-1, 0, 0], 1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 0.8], 4: [0, 0, 0]} self.seed = seed self.detect_edges = detect_edges self.detect_grass = detect_grass self.flip = flip self.flip_episode = False self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') # returns a random action def sample_action_space(self): return np.random.randint(self.action_space_size) def map_action(self, action): if self.flip_episode and action <= 1: action = 1 - action return self.action_dict[action] # resets the environment and returns the initial state def reset(self, test=False): self.state_rnn = rnn_init_state(self.rnn) if self.seed: self.env.seed(random.choice(self.seed)) self.flip_episode = random.random() > 0.5 and not test and self.flip state, self.state_rnn = self.encode_obs(self.env.reset(), self.state_rnn, np.array([0.5, 0.2, 0.8])) return state, 1 # take action def step(self, action, test=False): action = self.map_action(action) total_reward = 0 n = 1 if test else random.choice([2, 3, 4]) for i in range(n): next_state, reward, done, info = self.env.step(action) next_state, self. state_rnn = self.encode_obs(next_state, self.state_rnn, action) total_reward += reward info = {'true_done': done} if done: break return next_state, total_reward, done, info, 1 def render(self): self.env.render() # process state and return the current history def process(self, state): self.add_history(state) in_grass = utils.in_grass(state) if len(self.history) < self.history_pick: zeros = np.zeros(self.image_dimension) result = np.tile(zeros, ((self.history_pick - len(self.history)), 1, 1)) result = np.concatenate((result, np.array(self.history))) else: result = np.array(self.history) return result, in_grass def add_history(self, state): if len(self.history) >= self.history_pick: self.history.pop(0) #temp = utils.process_image(state, detect_edges=self.detect_edges, flip=self.flip_episode) self.history.append(state) def __str__(self): return self.name + '\nseed: {0}\nactions: {1}'.format(self.seed, self.action_dict) def encode_obs(self, obs, prev_state, action): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float)/255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar/2.0) * np.random.randn(*s) h = rnn_output(prev_state, z, 4) next_state = rnn_next_state(self.rnn, z, np.array(action), prev_state) return np.concatenate([h, z]), next_state
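The wrapper above exposes five discrete actions that map onto CarRacing's continuous [steer, gas, brake] triple, and map_action swaps left/right when an episode is flipped. A self-contained sketch of that mapping with the same action_dict:

ACTION_DICT = {0: [-1, 0, 0], 1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 0.8], 4: [0, 0, 0]}

def map_discrete_action(action, flip_episode=False):
    # indices 0 and 1 are steer-left / steer-right; flipping mirrors them
    if flip_episode and action <= 1:
        action = 1 - action
    return ACTION_DICT[action]

assert map_discrete_action(0) == [-1, 0, 0]
assert map_discrete_action(0, flip_episode=True) == [1, 0, 0]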
class Model: ''' simple one layer model for car racing ''' def __init__(self, load_model=True, env_name="Pong-v0", render_mode=False): self.env_name = env_name self.make_env() self.z_size = 32 self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) hps_atari = hps_sample._replace(input_seq_width=self.z_size + self.na) self.rnn = MDNRNN(hps_atari, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.init_controller() self.render_mode = False # INIT The Controller After the enviroment Creation. def make_env(self, seed=-1, render_mode=False): self.render_mode = render_mode self.env = make_env(self.env_name, seed=seed, render_mode=render_mode) self.na = self.env.action_space.n # discrete by default. def init_controller(self): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn( self.hidden_size, self.na) # pong. Modify later. self.bias_output = np.random.randn(self.na) self.param_count = (self.input_size + 1) * self.hidden_size + ( self.hidden_size + 1) * self.na else: self.weight = np.random.randn(self.input_size, self.na) self.bias = np.random.randn(self.na) self.param_count = (self.input_size + 1) * self.na def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 1) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z, epsilon=0.0): h = rnn_output(self.state, z, EXP_MODE) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if np.random.rand() < epsilon: action = np.random.randint(0, self.na) else: if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.maximum( np.dot(h, self.weight_hidden) + self.bias_hidden, 0) action = np.argmax( np.dot(h, self.weight_output) + self.bias_output) else: action = np.argmax(np.dot(h, self.weight) + self.bias) oh_action = np.zeros(self.na) oh_action[action] = 1 # action[1] = (action[1]+1.0) / 2.0 # action[2] = clip(action[2]) # TODO check about this fucntion self.state = rnn_next_state(self.rnn, z, oh_action, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:self.na] self.weight_output = params_2[self.na:].reshape( self.hidden_size, self.na) else: self.bias = np.array(model_params[:self.na]) self.weight = np.array(model_params[self.na:]).reshape( self.input_size, self.na) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): 
        # return np.random.randn(self.param_count) * stdev
        return np.random.standard_cauchy(self.param_count) * stdev  # spice things up

    def init_random_model_params(self, stdev=0.1):
        params = self.get_random_model_params(stdev=stdev)
        self.set_model_params(params)
        vae_params = self.vae.get_random_model_params(stdev=stdev)
        self.vae.set_model_params(vae_params)
        rnn_params = self.rnn.get_random_model_params(stdev=stdev)
        self.rnn.set_model_params(rnn_params)
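get_random_model_params above deliberately draws from a standard Cauchy instead of a Gaussian ("spice things up"); its heavy tails occasionally produce very large initial weights. A purely illustrative comparison:

import numpy as np

rng = np.random.RandomState(0)
gauss = rng.randn(10000) * 0.1
cauchy = rng.standard_cauchy(10000) * 0.1
print(np.abs(gauss).max())    # typically below ~0.5
print(np.abs(cauchy).max())   # heavy tails: can be orders of magnitude larger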
class Model: ''' simple one layer model for car racing ''' def __init__(self, load_model=True): self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 2) self.bias_output = np.random.randn(3) self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 2 + 2) else: self.weight = np.random.randn(self.input_size, 2) self.bias = np.random.randn(2) self.param_count = (self.input_size) * 2 + 2 def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(h, self.weight) + self.bias) action[0] = clip(action[0]) action[1] = clip(action[1]) self.state = rnn_next_state(self.rnn, z, action, self.state) if np.random.uniform(0, 1) < 0.2: action = [np.random.uniform(-2, 2), np.random.uniform(-2, 2)] return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:2] self.weight_output = params_2[2:].reshape(self.hidden_size, 2) else: self.bias = np.array(model_params[:2]) self.weight = np.array(model_params[2:]).reshape( self.input_size, 2) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): #return np.random.randn(self.param_count)*stdev return np.random.standard_cauchy( self.param_count) * stdev # spice things up def init_random_model_params(self, stdev=0.1): params = self.get_random_model_params(stdev=stdev) self.set_model_params(params) vae_params = self.vae.get_random_model_params(stdev=stdev) self.vae.set_model_params(vae_params) rnn_params = self.rnn.get_random_model_params(stdev=stdev) self.rnn.set_model_params(rnn_params)
class Model: ''' simple one layer model for car racing ''' def __init__(self): self.env_name = "carracing" self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.vae.load_json('vae/vae.json') self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 3) self.bias_output = np.random.randn(3) self.param_count = ((self.input_size+1)*self.hidden_size) + (self.hidden_size*3+3) else: self.weight = np.random.randn(self.input_size, 3) self.bias = np.random.randn(3) self.param_count = (self.input_size)*3+3 self.render_mode = False def make_env(self, seed=-1, render_mode=False): self.render_mode = render_mode self.env = make_env(self.env_name, seed=seed, render_mode=render_mode) def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float)/255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar/2.0) * np.random.randn(*s) return z, mu, logvar def decode_obs(self, z): # decode the latent vector img = self.vae.decode(z.reshape(1, self.z_size)) * 255. img = np.round(img).astype(np.uint8) img = img.reshape(64, 64, 3) return img def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(h, self.weight) + self.bias) action[1] = (action[1]+1.0) / 2.0 action[2] = clip(action[2]) self.state = rnn_next_state(self.rnn, z, action, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size+1)*self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape(self.input_size, self.hidden_size) self.bias_output = params_2[:3] self.weight_output = params_2[3:].reshape(self.hidden_size, 3) else: self.bias = np.array(model_params[:3]) self.weight = np.array(model_params[3:]).reshape(self.input_size, 3) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): return np.random.randn(self.param_count)*stdev
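The Model above pairs encode_obs with decode_obs, so a frame can be compressed to a 32-dim z and decoded back. A hedged usage sketch; it assumes vae/vae.json and rnn/rnn.json are on disk and uses a blank placeholder frame rather than a real observation.

import numpy as np

model = Model()                                  # loads vae.json / rnn.json as above
frame = np.zeros((64, 64, 3), dtype=np.uint8)    # placeholder 64x64 RGB observation
z, mu, logvar = model.encode_obs(frame)
recon = model.decode_obs(z)                      # uint8 image, shape (64, 64, 3)
print(z.shape, recon.shape)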
def sample_vae2(args): """ For vae from https://github.com/hardmaru/WorldModelsExperiments.git """ z_size = 64 # This needs to match the size of the trained vae batch_size = args.count learning_rate = 0.0001 kl_tolerance = 0.5 model_path_name = "tf_vae" reset_graph() vae = ConvVAE( z_size=z_size, batch_size=batch_size, learning_rate=learning_rate, kl_tolerance=kl_tolerance, is_training=False, reuse=False, gpu_mode=False) # use GPU on batchsize of 1000 -> much faster vae.load_json(os.path.join(model_path_name, 'vae.json')) z = np.random.normal(size=(args.count, z_size)) samples = vae.decode(z) input_dim = samples.shape[1:] gen = DriveDataGenerator(args.dirs, image_size=(64, 64), batch_size=args.count, shuffle=True, max_load=10000, images_only=True) orig = gen[0].astype(np.float) / 255.0 #mu, logvar = vae.encode_mu_logvar(orig) #recon = vae.decode( mu ) recon = vae.decode(vae.encode(orig)) n = args.count plt.figure(figsize=(20, 6), tight_layout=False) plt.title('VAE samples') for i in range(n): ax = plt.subplot(3, n, i + 1) plt.imshow(samples[i].reshape(input_dim[0], input_dim[1], input_dim[2])) ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) if 0 == i: ax.set_title("Random") for i in range(n): ax = plt.subplot(3, n, n + i + 1) plt.imshow(orig[i].reshape(input_dim[0], input_dim[1], input_dim[2])) ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) if 0 == i: ax.set_title("Real") ax = plt.subplot(3, n, (2 * n) + i + 1) plt.imshow(recon[i].reshape(input_dim[0], input_dim[1], input_dim[2])) ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) if 0 == i: ax.set_title("Reconstructed") plt.savefig("samples_vae.png") plt.show()
filelist.sort()
filelist = filelist[0:NUM_DATA]
dataset, action_dataset = load_raw_data_list(filelist)

reset_graph()
vae = ConvVAE(z_size=z_size,
              batch_size=batch_size,
              learning_rate=learning_rate,
              kl_tolerance=kl_tolerance,
              is_training=False,
              reuse=False,
              gpu_mode=True)  # use GPU on a batch size of 1000 -> much faster
vae.load_json(os.path.join(model_path_name, args.name + '_vae.json'))

mu_dataset = []
logvar_dataset = []
for i in range(len(dataset)):
    data_batch = dataset[i]
    mu, logvar, z = encode_batch(data_batch)
    mu_dataset.append(mu.astype(np.float16))
    logvar_dataset.append(logvar.astype(np.float16))
    if (i + 1) % 100 == 0:
        print(i + 1)

action_dataset = np.array(action_dataset)
mu_dataset = np.array(mu_dataset)
logvar_dataset = np.array(logvar_dataset)
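encode_batch is called in the loop above but not shown in this excerpt. A plausible sketch of what it does, assuming the usual 64x64x3 uint8 frames and the global vae built above; the real helper may differ in details.

import numpy as np

def encode_batch(data_batch):
    obs = data_batch.astype(np.float32) / 255.0     # scale pixels to [0, 1]
    obs = obs.reshape(-1, 64, 64, 3)
    mu, logvar = vae.encode_mu_logvar(obs)
    z = mu + np.exp(logvar / 2.0) * np.random.randn(*logvar.shape)
    return mu, logvar, z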
class VAERacingStack(CarRacing): def __init__(self, full_episode=False, discrete_mode=False): super(VAERacingStack, self).__init__() self._internal_counter = 0 self.z_size = games['vae_racing_stack'].input_size self.vae = ConvVAE(batch_size=1, z_size=self.z_size, num_channel=FRAME_STACK, gpu_mode=False, is_training=False, reuse=True) self.vae.load_json('vae/vae_stack_' + str(FRAME_STACK) + '.json') self.full_episode = full_episode high = np.array([np.inf] * self.z_size) self.observation_space = Box(-high, high) self.cumulative_frames = None self._has_rendered = False self.discrete_mode = discrete_mode def _get_image(self, z, cumulative_frames): large_img = np.zeros((64 * 2, 64 * FRAME_STACK)) # decode the latent vector if z is not None: img = self.vae.decode(z.reshape(1, self.z_size)) * 255.0 img = np.round(img).astype(np.uint8) img = img.reshape(64, 64, FRAME_STACK) for i in range(FRAME_STACK): large_img[64:, i * 64:(i + 1) * 64] = img[:, :, i] if len(cumulative_frames) == FRAME_STACK: for i in range(FRAME_STACK): large_img[:64, i * 64:(i + 1) * 64] = cumulative_frames[i] large_img = large_img.astype(np.uint8) return large_img def _reset(self): self._internal_counter = 0 self.cumulative_frames = None self._has_rendered = False return super(VAERacingStack, self)._reset() def _render(self, mode='human', close=False): if mode == 'human' or mode == 'rgb_array': self._has_rendered = True return super(VAERacingStack, self)._render(mode=mode, close=close) def _step(self, action): if not self._has_rendered: self._render("rgb_array") self._has_rendered = False if action is not None: if not self.discrete_mode: action[0] = _clip(action[0], lo=-1.0, hi=+1.0) action[1] = _clip(action[1], lo=-1.0, hi=+1.0) action[1] = (action[1] + 1.0) / 2.0 action[2] = _clip(action[2]) else: ''' in discrete setting: if action[0] is the highest, then agent does nothing if action[1] is the highest, then agent hits the pedal if -action[1] is the highest, then agent hits the brakes if action[2] is the highest, then agent turns left if action[3] is the highest, then agent turns right ''' logits = [ _clip((action[0] + 1.0), hi=+2.0), _clip(action[1]), _clip(-action[1]), _clip(action[2]), _clip(-action[2]) ] probs = softmax(logits) #chosen_action = np.argmax(logits) chosen_action = sample(probs) a = np.array([0.0, 0.0, 0.0]) if chosen_action == 1: a[1] = +1.0 # up if chosen_action == 2: a[2] = +0.8 # down: 0.8 as recommended by the environment's built-in demo if chosen_action == 3: a[0] = -1.0 # left if chosen_action == 4: a[0] = +1.0 # right action = a #print("chosen_action", chosen_action, action) obs, reward, done, _ = super(VAERacingStack, self)._step(action) if self.cumulative_frames is not None: self.cumulative_frames.pop(0) self.cumulative_frames.append(_process_frame_green(obs)) else: self.cumulative_frames = [_process_frame_green(obs)] * FRAME_STACK self.z = z = _compress_frames(self.cumulative_frames, self.vae) if self.full_episode: return z, reward, False, {} self._internal_counter += 1 if self._internal_counter > TIME_LIMIT: done = True #img = self._get_image(self.z, self.cumulative_frames) #imageio.imwrite('dump/'+('%0*d' % (4, self._internal_counter))+'.png', img) return z, reward, done, {}
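softmax and sample are used in the discrete branch above but are not defined in this excerpt; a minimal sketch of what they presumably do (a numerically stable softmax followed by a categorical draw):

import numpy as np

def softmax(logits):
    x = np.asarray(logits, dtype=np.float64)
    x = x - x.max()                 # shift for numerical stability
    e = np.exp(x)
    return e / e.sum()

def sample(probs):
    # draw an index according to the given probabilities
    return np.random.choice(len(probs), p=probs)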
class VAERacingWorld(CarRacing): def __init__(self, full_episode=False, pure_world=False): super(VAERacingWorld, self).__init__() self._internal_counter = 0 self.z_size = games['vae_racing'].input_size self.vae = ConvVAE(batch_size=1, z_size=self.z_size, gpu_mode=False, is_training=False, reuse=True) self.vae.load_json('vae/vae_' + str(self.z_size) + '.json') self.full_episode = full_episode if pure_world: high = np.array([np.inf] * 10) else: high = np.array([np.inf] * (self.z_size + 10)) self.observation_space = Box(-high, high) self._has_rendered = False self.real_frame = None self.world_model = SimpleWorldModel(obs_size=16, action_size=3, hidden_size=10) world_model_path = "./log/learn_vae_racing.cma.4.64.best.json" self.world_model.load_model(world_model_path) self.pure_world_mode = pure_world def _reset(self): self._internal_counter = 0 self._has_rendered = False self.real_frame = None return super(VAERacingWorld, self)._reset() def _render(self, mode='human', close=False): if mode == 'human' or mode == 'rgb_array': self._has_rendered = True return super(VAERacingWorld, self)._render(mode=mode, close=close) def _step(self, action): if not self._has_rendered: self._render("rgb_array") self._has_rendered = False old_action = [0, 0, 0] if action is not None: old_action = np.copy(action) action[0] = _clip(action[0], lo=-1.0, hi=+1.0) action[1] = _clip(action[1], lo=-1.0, hi=+1.0) action[1] = (action[1] + 1.0) / 2.0 action[2] = _clip(action[2]) obs, reward, done, _ = super(VAERacingWorld, self)._step(action) result = np.copy(_process_frame(obs)).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 3) self.real_frame = result #z = self.vae.encode(result).flatten() mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) if self.full_episode: if MU_MODE: return mu, reward, False, {} else: return z, reward, False, {} self._internal_counter += 1 if self._internal_counter > TIME_LIMIT: done = True if MU_MODE: z = mu self.world_model.predict_next_obs(z, old_action) if self.pure_world_mode: z = np.copy(self.world_model.hidden_state) else: z = np.concatenate([z, self.world_model.hidden_state], axis=0) return z, reward, done, {}
class Model: ''' simple one layer model for car racing ''' def __init__(self, load_model=True): # For Mac # self.env_name = "/Users/intuinno/codegit/pushBlock/app/mac/VisualPushBlockContinuous" # For linux self.env_name = "/home/intuinno/codegit/pushblock/app/linux/pushblock.x86_64" self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 3) self.bias_output = np.random.randn(3) self.param_count = ((self.input_size+1)*self.hidden_size) + (self.hidden_size*3+3) else: self.weight = np.random.randn(self.input_size, 3) self.bias = np.random.randn(3) self.param_count = (self.input_size)*3+3 self.render_mode = False def make_env(self, seed=-1, render_mode=False, full_episode=False, workerid=1): self.render_mode = render_mode self.env = make_env(self.env_name, seed=seed, render_mode=render_mode, full_episode=full_episode, workerid=workerid) def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float)/255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar/2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(h, self.weight) + self.bias) self.state = rnn_next_state(self.rnn, z, action, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size+1)*self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape(self.input_size, self.hidden_size) self.bias_output = params_2[:3] self.weight_output = params_2[3:].reshape(self.hidden_size, 3) else: self.bias = np.array(model_params[:3]) self.weight = np.array(model_params[3:]).reshape(self.input_size, 3) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): #return np.random.randn(self.param_count)*stdev return np.random.standard_cauchy(self.param_count)*stdev # spice things up def init_random_model_params(self, stdev=0.1): params = self.get_random_model_params(stdev=stdev) self.set_model_params(params) vae_params = self.vae.get_random_model_params(stdev=stdev) self.vae.set_model_params(vae_params) rnn_params = self.rnn.get_random_model_params(stdev=stdev) 
self.rnn.set_model_params(rnn_params)
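A hedged sketch of how the Model above is typically driven: encode each observation to z, query the controller, and step the environment. The episode horizon and the env's reset/step signatures are assumptions; the real evaluation loop lives elsewhere in the repo.

model = Model(load_model=True)
model.make_env(seed=0, render_mode=False)
obs = model.env.reset()
model.reset()
total_reward = 0.0
for t in range(1000):                       # illustrative horizon
    z, mu, logvar = model.encode_obs(obs)
    action = model.get_action(z)
    obs, reward, done, info = model.env.step(action)
    total_reward += reward
    if done:
        break
print(total_reward)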
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

np.set_printoptions(precision=4, edgeitems=6, linewidth=100, suppress=True)
reset_graph()

# Third, build the VAE.
vae = ConvVAE(z_size=z_size, batch_size=1, is_training=False, reuse=False, gpu_mode=False)
vae.load_json(os.path.join('vae', 'vae.json'))

# Fourth, build the RNN.
hps_atari_sample = hps_sample._replace(input_seq_width=z_size + na)
OUTWIDTH = hps_atari_sample.output_seq_width
rnn = MDNRNN(hps_atari_sample, gpu_mode=False)
rnn.load_json(os.path.join('rnn', 'rnn.json'))
print("All models loaded.")

# Fifth, run the evaluation. We have no prediction for the first frame.
start = time.time()
state = rnn_init_state(rnn)  # initialize the RNN state
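A hedged sketch of one evaluation step with the models built above: encode the current frame, then advance the MDN-RNN with the chosen one-hot action. frame and action_index are assumed to come from the surrounding script, and the single-channel 64x64 reshape mirrors the other Atari snippets here.

obs = np.copy(frame).astype(np.float32) / 255.0   # frame: assumed 64x64 grayscale uint8
obs = obs.reshape(1, 64, 64, 1)
mu, logvar = vae.encode_mu_logvar(obs)
z = mu[0] + np.exp(logvar[0] / 2.0) * np.random.randn(*logvar[0].shape)
one_hot = np.zeros(na)
one_hot[action_index] = 1                         # action_index: assumed chosen elsewhere
state = rnn_next_state(rnn, z, one_hot, state)    # carry the state to the next frame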
class DQN: """ DNN agent for Active Inference The archtecture consists of P model and A model P model(Perception): pretraining VAE encoder part A model(Action): DQN solve expected Free energy like Q function """ def __init__(self, env, batchsize=64, input_size=(64,64), num_frame_stack=4, gamma=0.95, frame_skip=1, train_freq=4, initial_epsilon=1.0, min_epsilon=0.1, render=True, epsilon_decay_steps=int(1e6), min_experience_size=int(1e3), experience_capacity=int(1e5), network_update_freq=5000, regularization=1e-6, optimizer_params=None, action_map=None ): self.vae = ConvVAE(batch_size=batchsize, gpu_mode=False, is_training=False, reuse=True) self.vae.load_json('vae/vae.json') if action_map is not None: self.dim_actions = len(action_map) else: self.dim_actions = env.action_space.n self.network_update_freq = network_update_freq self.action_map = action_map self.env = env self.batchsize = batchsize self.num_frame_stack = num_frame_stack self.gamma = gamma self.frame_skip = frame_skip self.train_freq = train_freq self.initial_epsilon = initial_epsilon self.min_epsilon = min_epsilon self.epsilon_decay_steps = epsilon_decay_steps self.render = render self.min_experience_size = min_experience_size self.input_size = input_size self.regularization = regularization self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7) self.do_training = True self.playing_epsilon = 0.0 self.session = None self.state_size = (self.num_frame_stack,) + self.input_size self.global_counter = 0 self.episode_counter =0 def build_graph(self): input_dim_with_batch = (self.batchsize, self.num_frame_stack) + self.input_size input_dim_general = (None, self.num_frame_stack) + self.input_size self.input_prev_state = tf.placeholder(tf.float32, input_dim_general, "prev_state") self.input_next_state = tf.placeholder(tf.float32, input_dim_with_batch, "next_state") self.input_reward = tf.placeholder(tf.float32, self.batchsize, "reward") self.input_actions = tf.placeholder(tf.int32, self.batchsize, "actions") self.input_done_mask = tf.placeholder(tf.int32, self.batchsize, "done_mask") # The target Q-values with tf.variable_scope("fixed"): qsa_targets = self.create_network(self.input_next_state, trainable=False) # The estimate Q-values with tf.variable_scope("train"): qsa_estimates = self.create_network(self.input_prev_state, trainable=True) self.best_action = tf.argmax(qsa_estimates, axis=1) not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32") q_target = np.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1) q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice) training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize optimizer = tf.train.AdamOptimizer(**(self.optimizer_params)) reg_loss = tf.add_n(tf.losses.get_regularization_losses()) self.train_op = optimizer.minimize(reg_loss + training_loss) train_params = self.get_variables("train") fixed_params = self.get_variables("fixed") # test assert (len(train_params)) == len(fixed_params) self.copy_network_ops = [tf.assign(fixed_v, train_v) for train_v, fixed_v in zip(train_params, fixed_params)] ### Create estimate Q-network and target Q-network def create_network(self, input, trainable): if trainable: wr = slim.l2l2_regularizer(self.regularization) else: wr = None input_t = tf.transpose(input, [0, 2, 3, 1]) net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC", 
activation_fn=tf.nn.relu, stride=3, weights_regularizer=wr, trainable=trainable) net = slim.max_pool2d(net, 2, 2) net = slim.conv2d(net, 16, (3, 3), data_format="NHWC", activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) net = slim.max_pool2d(net, 2, 2) net = slim.flatten(net) net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable) q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn=None, weights_regularizer=wr, trainable=trainable) return q_state_action_values def get_random_action(self): return np.random.choice(self.dim_actions) def get_epsilon(self): if not self.do_training: return self.playing_epsilon elif self.global_counter >= self.epsilon_decay_steps: return self.min_epsilon else: # linear decay r = 1.0 - self.global_counter / float(self.epsilon_decay_steps) return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r ### Training operation with data from Replay Memory def train(self): batch = self.exp_history.sample_mini_batch(self.batchsize) fd = { self.input_reward: "reward", self.input_prev_state: "prev_state", self.input_next_state: "next_state", self.input_actions: "actions", self.input_done_mask: "done_mask" } fd1 = {ph: batch[k] for ph, k in fd.items()} self.session.run([self.train_op], fd1) def play_episode(self): # Replay Memory eh = ( self.exp_history if self.do_training else self.playing_cache ) total_reward = 0 frames_in_episode = 0 # Start environment first_frame = self.env.reset() first_frame_pp = self.process_image(first_frame) eh.start_new_episode(first_frame_pp) while True: if np.random.rand() > self.get_epsilon(): action_idx = self.session.run( self.best_action, {self.input_prev_state: eh.current_state()[np.newaxis, ...]} )[0] else: action_idx = self.get_random_action() if self.action_map is not None: action = self.action_map[action_idx] else: action = action_idx reward = 0 for _ in range(self.frame_skip): observation, r, done, info = self.env.step(action) if self.render: self.env.render() reward += r if done: break early_done, punishment = self.check_early_stop(reward, total_reward) if early_done: reward += punishment done = done or early_done total_reward += reward frames_in_episode += 1 eh.add_experience(self.process_image(observation), action_idx, done, reward) if self.do_training: self.global_counter += 1 if self.global_counter % self.network_update_freq: self.update_target_network() train_cond = ( self.exp_history.counter >= self.min_experience_size and self.global_counter % self.train_freq == 0 ) if train_cond: self.train() if done: if self.do_training: self.episode_counter += 1 return total_reward, frames_in_episode
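get_epsilon above decays epsilon linearly from initial_epsilon to min_epsilon over epsilon_decay_steps. The same arithmetic as a standalone function, with the defaults from the constructor:

def linear_epsilon(step, initial=1.0, minimum=0.1, decay_steps=int(1e6)):
    if step >= decay_steps:
        return minimum
    r = 1.0 - step / float(decay_steps)
    return minimum + (initial - minimum) * r

print(linear_epsilon(0))         # 1.0
print(linear_epsilon(500000))    # 0.55
print(linear_epsilon(10**6))     # 0.1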
class MiniNetwork(object): def __init__(self, sess=None, summary_writer=tf.summary.FileWriter("logs/"), rl_training=False, reuse=False, cluster=None, index=0, device='/gpu:0', ppo_load_path=None, ppo_save_path=None, load_worldmodel=True, ntype='worldmodel'): self.policy_model_path_load = ppo_load_path + ntype self.policy_model_path_save = ppo_save_path + ntype self.rl_training = rl_training self.use_norm = True self.reuse = reuse self.sess = sess self.cluster = cluster self.index = index self.device = device self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_worldmodel: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.input_size = rnn_output_size(EXP_MODE) self._create_graph() self.rl_saver = tf.train.Saver() self.summary_writer = summary_writer def initialize(self): init_op = tf.global_variables_initializer() self.sess.run(init_op) def reset_old_network(self): self.policy_ppo.assign_policy_parameters() self.policy_ppo.reset_mean_returns() self.sess.run(self.results_sum.assign(0)) self.sess.run(self.game_num.assign(0)) def _create_graph(self): if self.reuse: tf.get_variable_scope().reuse_variables() assert tf.get_variable_scope().reuse worker_device = "/job:worker/task:%d" % self.index + self.device with tf.device( tf.train.replica_device_setter(worker_device=worker_device, cluster=self.cluster)): self.results_sum = tf.get_variable( name="results_sum", shape=[], initializer=tf.zeros_initializer) self.game_num = tf.get_variable(name="game_num", shape=[], initializer=tf.zeros_initializer) self.global_steps = tf.get_variable( name="global_steps", shape=[], initializer=tf.zeros_initializer) self.win_rate = self.results_sum / self.game_num self.mean_win_rate = tf.summary.scalar( 'mean_win_rate_dis', self.results_sum / self.game_num) self.merged = tf.summary.merge([self.mean_win_rate]) mini_scope = "MiniPolicyNN" with tf.variable_scope(mini_scope): ob_space = self.input_size act_space_array = _SIZE_MINI_ACTIONS self.policy = Policy_net('policy', self.sess, ob_space, act_space_array) self.policy_old = Policy_net('old_policy', self.sess, ob_space, act_space_array) self.policy_ppo = PPOTrain('PPO', self.sess, self.policy, self.policy_old, lr=P.mini_lr, epoch_num=P.mini_epoch_num) var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) self.policy_saver = tf.train.Saver(var_list=var_list) def Update_result(self, result_list): win = 0 for i in result_list: if i > 0: win += 1 self.sess.run(self.results_sum.assign_add(win)) self.sess.run(self.game_num.assign_add(len(result_list))) def Update_summary(self, counter): print("Update summary........") policy_summary = self.policy_ppo.get_summary_dis() self.summary_writer.add_summary(policy_summary, counter) summary = self.sess.run(self.merged) self.summary_writer.add_summary(summary, counter) self.sess.run(self.global_steps.assign(counter)) print("Update summary finished!") steps = int(self.sess.run(self.global_steps)) win_game = int(self.sess.run(self.results_sum)) all_game = int(self.sess.run(self.game_num)) win_rate = win_game / float(all_game) return steps, win_rate def get_win_rate(self): return float(self.sess.run(self.win_rate)) def Update_policy(self, buffer): self.policy_ppo.ppo_train_dis(buffer.observations, buffer.tech_actions, buffer.rewards, buffer.values, buffer.values_next, buffer.gaes, buffer.returns, verbose=False) def get_global_steps(self): return int(self.sess.run(self.global_steps)) def save_policy(self): 
        self.policy_saver.save(self.sess, self.policy_model_path_save)
        print("policy has been saved in", self.policy_model_path_save)

    def restore_policy(self):
        self.policy_saver.restore(self.sess, self.policy_model_path_load)
        print("Restore policy from", self.policy_model_path_load)
class Model: ''' simple one layer model for car racing ''' def __init__(self, load_model=True): self.env_name = './VisualPushBlock_withBlock_z_info.x86_64' #'./VisualPushBlock.x86_64' self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = z_size if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer ###CHANGE is made here self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, ACTION_SIZE) self.bias_output = np.random.randn(ACTION_SIZE) self.param_count = ((self.input_size + 1) * self.hidden_size) + ( self.hidden_size * ACTION_SIZE + ACTION_SIZE) else: self.weight = np.random.randn(self.input_size, ACTION_SIZE) self.bias = np.random.randn(ACTION_SIZE) self.param_count = (self.input_size) * ACTION_SIZE + ACTION_SIZE self.render_mode = False def make_env(self, seed=-1, render_mode=False, full_episode=False, worker_id=0): self.render_mode = render_mode self.env = make_env(self.env_name, seed=seed, render_mode=render_mode, full_episode=full_episode, worker_id=worker_id) def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar #result = np.copy(obs).astype(np.float)/255.0 result = np.copy(obs).astype(np.float) result = result.reshape(1, IMAGE_W, IMAGE_H, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) #print('h', h.shape, h) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: '''print(h.shape) print(self.weight.shape) print(self.bias.shape)''' action = np.tanh(np.dot(h, self.weight) + self.bias) '''for i in range(ACTION_SIZE): action[i] = (action[i]+1.0) / 2.0 #all actions value are in range 0 to 1''' #action[2] = clip(action[2]) self.state = rnn_next_state(self.rnn, z, action, self.state) #update weights of MDN-RNN return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:ACTION_SIZE] self.weight_output = params_2[ACTION_SIZE:].reshape( self.hidden_size, ACTION_SIZE) else: self.bias = np.array(model_params[:ACTION_SIZE]) self.weight = np.array(model_params[ACTION_SIZE:]).reshape( self.input_size, ACTION_SIZE) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): #return 
np.random.randn(self.param_count)*stdev
        return np.random.standard_cauchy(self.param_count) * stdev  # spice things up

    def init_random_model_params(self, stdev=0.1):
        params = self.get_random_model_params(stdev=stdev)
        self.set_model_params(params)
        vae_params = self.vae.get_random_model_params(stdev=stdev)
        self.vae.set_model_params(vae_params)
        rnn_params = self.rnn.get_random_model_params(stdev=stdev)
        self.rnn.set_model_params(rnn_params)
class Model: ''' simple one layer model for car racing ''' def __init__(self, arglist, action_space, scope, load_model=True): self.action_space = action_space self.arglist = arglist self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) hps_sample = hps_model._replace( batch_size=1, input_seq_width=32 + arglist.action_space + (arglist.agent_num - 1) * arglist.action_space * arglist.timestep, max_seq_len=1, use_recurrent_dropout=0, is_training=0) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json(arglist.vae_model_dir) self.rnn.load_json(arglist.rnn_model_dir) self.state = rnn_init_state(self.rnn) self.rnn_mode = True if arglist.inference: self.input_size = rnn_output_size( EXP_MODE) + (arglist.agent_num - 1) * arglist.action_space else: self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 # action trajectories recording self.act_traj = [ collections.deque(np.zeros( (arglist.timestep, arglist.action_space)), maxlen=arglist.timestep) ] * (arglist.agent_num - 1) self.oppo_model = Oppo_Model(arglist.agent_num, arglist.timestep, arglist.action_space, arglist.action_space, "oppo_model_{}".format(scope)) self.inference = arglist.inference if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, self.action_space) self.bias_output = np.random.randn(self.action_space) self.param_count = ((self.input_size + 1) * self.hidden_size) + ( self.hidden_size * self.action_space + self.action_space) else: self.weight = np.random.randn(self.input_size, self.action_space) self.bias = np.random.randn(self.action_space) self.param_count = ( self.input_size) * self.action_space + self.action_space def reset(self): self.state = rnn_init_state(self.rnn) # self.oppo_state = lstm_init_state(self.oppo_model) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) if self.arglist.inference: oppo_intents = [] for i in range(self.arglist.agent_num - 1): act_traj = self.act_traj[i] intent = self.oppo_model.get_inference(act_traj) oppo_intents.append(intent) oppo_intents = np.reshape( oppo_intents, ((self.arglist.agent_num - 1) * self.arglist.action_space)) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' #Oppo intent shape (batch_size, agent_num, action_space) # reshape oppo_intent agent_num * batch_size * action_space controller_input = np.concatenate((h, oppo_intents)) else: controller_input = h if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer x = np.tanh( np.dot(controller_input, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(x, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(controller_input, self.weight) + self.bias) for i in range(self.action_space): action[i] = clip(action[i]) self.state = rnn_next_state(self.rnn, z, action, self.act_traj, self.state) # self.oppo_state = oppo_next_state(self.oppo_model, action, self.act_traj, self.oppo_state) # epsilon exploration if np.random.uniform(0, 1) < 0.2: 
action = [np.random.uniform(-3, 3)] * len(action) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:self.action_space] self.weight_output = params_2[self.action_space:].reshape( self.hidden_size, self.action_space) else: self.bias = np.array(model_params[:self.action_space]) self.weight = np.array(model_params[self.action_space:]).reshape( self.input_size, self.action_space) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): #return np.random.randn(self.param_count)*stdev return np.random.standard_cauchy( self.param_count) * stdev # spice things up def init_random_model_params(self, stdev=0.1): params = self.get_random_model_params(stdev=stdev) self.set_model_params(params) vae_params = self.vae.get_random_model_params(stdev=stdev) self.vae.set_model_params(vae_params) rnn_params = self.rnn.get_random_model_params(stdev=stdev) self.rnn.set_model_params(rnn_params)
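The opponent-modelling controller above feeds Oppo_Model.get_inference with a fixed-length action trajectory per opponent (self.act_traj). A hedged sketch of how an outer loop would record observed opponent actions each step; note that the list multiplication in __init__ ([deque(...)] * (agent_num - 1)) aliases a single deque across all opponents, so independent deques are usually what is intended. model and observed_opponent_actions are assumed to come from the environment loop.

import collections
import numpy as np

# independent per-opponent trajectories (instead of the aliased list created above)
model.act_traj = [collections.deque(np.zeros((model.arglist.timestep, model.arglist.action_space)),
                                    maxlen=model.arglist.timestep)
                  for _ in range(model.arglist.agent_num - 1)]

for i, oppo_action in enumerate(observed_opponent_actions):
    model.act_traj[i].append(np.array(oppo_action))   # the deque drops its oldest entry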
class Model: ''' simple one layer model for translating game state to actions''' def __init__(self, load_model=True): self.env_name = "Pong" self._make_env() self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) hps_sample_dynamic = hps_sample._replace(num_actions=self.num_actions) self.rnn = MDNRNN(hps_sample_dynamic, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer raise Exception("not ported for atari") self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, self.num_actions) self.bias_output = np.random.randn(self.num_actions) self.param_count = ((self.input_size + 1) * self.hidden_size) + ( (self.hidden_size + 1) * self.num_actions) else: # TODO: Not known until env.action_space is queried... self.weight = np.random.randn(self.input_size, self.num_actions) self.bias = np.random.randn(self.num_actions) self.param_count = (self.input_size + 1) * self.num_actions self.render_mode = False def _make_env(self): self.render_mode = render_mode self.env = make_env(self.env_name) self.num_actions = self.env.action_space.n def make_env(self): pass #TODO (Chazzz): eventually remove def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 1) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) # print(len(h), " h:", h) #TODO: 256+32 (the 32 comes first) # So we could have 288*2*18 params, or 288*2*environment.action_space.n (6 for Pong) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer raise Exception("Not ported to atari") # h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) # action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: # could probabilistically sample from softmax, but greedy action = np.argmax(np.matmul(h, self.weight) + self.bias) # action[1] = (action[1]+1.0) / 2.0 # action[2] = clip(action[2]) # print("Action:", action) action_one_hot = np.zeros(self.num_actions) action_one_hot[action] = 1 # print("Action hot:", action_one_hot) self.state = rnn_next_state(self.rnn, z, action_one_hot, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:self.num_actions] self.weight_output = params_2[self.num_actions:].reshape( self.hidden_size, self.num_actions) else: self.bias = np.array(model_params[:self.num_actions]) self.weight = np.array(model_params[self.num_actions:]).reshape( self.input_size, self.num_actions) def load_model(self, 
filename):
        with open(filename) as f:
            data = json.load(f)
        print('loading file %s' % (filename))
        self.data = data
        model_params = np.array(data[0])  # assuming other stuff is in data
        self.set_model_params(model_params)

    def get_random_model_params(self, stdev=0.1):
        # return np.random.randn(self.param_count) * stdev
        return np.random.standard_cauchy(self.param_count) * stdev  # spice things up

    def init_random_model_params(self, stdev=0.1):
        params = self.get_random_model_params(stdev=stdev)
        self.set_model_params(params)
        vae_params = self.vae.get_random_model_params(stdev=stdev)
        self.vae.set_model_params(vae_params)
        rnn_params = self.rnn.get_random_model_params(stdev=stdev)
        self.rnn.set_model_params(rnn_params)
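In the Pong model above the controller input h is the 32-dim z concatenated with the 256-dim RNN hidden state (the "256+32" noted in the comment), so the single-layer controller has (288 + 1) * num_actions parameters. A quick sanity check for Pong's six actions:

input_size = 32 + 256        # z followed by the RNN hidden state
num_actions = 6              # Pong's discrete action space
param_count = (input_size + 1) * num_actions
print(param_count)           # 1734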
class Model: ''' simple one layer model for car racing ''' def __init__(self, load_model=True): self.env_name = 'Carracing' self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 16 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 3) self.bias_output = np.random.randn(3) self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 3 + 3) else: self.weight = np.random.randn(self.input_size, 3) self.bias = np.random.randn(3) self.param_count = (self.input_size) * 3 + 3 self.render_mode = False def make_env(self, client, seed=-1, render_mode=False, full_episode=False): self.client = client self.render_mode = render_mode self.env = TorcsEnv( vision=False, throttle=True, gear_change=False ) #make_env(self.env_name, seed=seed, render_mode=render_mode, full_episode=full_episode) def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): dictlist = [] for key, value in obs.items(): if key == 'opponents' or key == 'track' or key == 'wheelSpinVel' or key == 'focus': dictlist = dictlist + value else: dictlist.append(value) obs = dictlist result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 79, 1) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(h, self.weight) + self.bias) action[0] = action[0] action[1] = (action[1] + 1.0) / 2.0 action[2] = clip(action[2]) self.state = rnn_next_state(self.rnn, z, action, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:3] self.weight_output = params_2[3:].reshape(self.hidden_size, 3) else: self.bias = np.array(model_params[:3]) self.weight = np.array(model_params[3:]).reshape( self.input_size, 3) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): #return np.random.randn(self.param_count)*stdev return np.random.standard_cauchy( self.param_count) * stdev # spice things up def init_random_model_params(self, stdev=0.1): params = self.get_random_model_params(stdev=stdev) self.set_model_params(params) 
        vae_params = self.vae.get_random_model_params(stdev=stdev)
        self.vae.set_model_params(vae_params)
        rnn_params = self.rnn.get_random_model_params(stdev=stdev)
        self.rnn.set_model_params(rnn_params)
filelist.sort() filelist = filelist[0:1000] dataset, action_dataset = load_raw_data_list(filelist) reset_graph() vae = ConvVAE(z_size=z_size, batch_size=batch_size, learning_rate=learning_rate, kl_tolerance=kl_tolerance, is_training=False, reuse=False, gpu_mode=True) # use GPU on batchsize of 1000 -> much faster vae.load_json(os.path.join(model_path_name, 'vae.json')) mu_dataset = [] logvar_dataset = [] for i in range(len(dataset)): data_batch = dataset[i] mu, logvar, z = encode_batch(data_batch) mu_dataset.append(mu.astype(np.float16)) logvar_dataset.append(logvar.astype(np.float16)) if ((i + 1) % 100 == 0): print(i + 1) action_dataset = np.array(action_dataset) mu_dataset = np.array(mu_dataset) logvar_dataset = np.array(logvar_dataset)
from baselines.ddpg.memory import Memory from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise from baselines.common import set_global_seeds import baselines.common.tf_util as U from baselines import logger import numpy as np try: from mpi4py import MPI except ImportError: MPI = None vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) vae.load_json('vae/vae.json') rnn.load_json('rnn/rnn.json') def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True,
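The snippet above only loads the VAE and MDN-RNN next to the DDPG imports; how observations reach the learner is not shown. A minimal sketch, assuming the DDPG policy is fed the same concatenated [z, h] features used by the other wrappers in this document (EXP_MODE, rnn_output, rnn_next_state and rnn_init_state come from those snippets):

import numpy as np

def encode_frame(obs, rnn_state, action):
    # obs: raw 64x64x3 frame; rnn_state: value returned by rnn_init_state(rnn)
    frame = np.copy(obs).astype(np.float32) / 255.0
    frame = frame.reshape(1, 64, 64, 3)
    mu, logvar = vae.encode_mu_logvar(frame)
    mu, logvar = mu[0], logvar[0]
    z = mu + np.exp(logvar / 2.0) * np.random.randn(*logvar.shape)
    h = rnn_output(rnn_state, z, EXP_MODE)
    next_rnn_state = rnn_next_state(rnn, z, np.array(action), rnn_state)
    return np.concatenate([z, h]), next_rnn_state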
dataset, action_dataset, oppo_action_dataset = load_raw_data_list( filelist, arglist) reset_graph() if arglist.use_vae: vae = ConvVAE( z_size=arglist.z_size, batch_size=arglist.batch_size, learning_rate=arglist.lr, kl_tolerance=arglist.kl_tolerance, is_training=False, reuse=False, gpu_mode=True) # use GPU on batchsize of 1000 -> much faster vae.load_json(os.path.join(arglist.vae_path, 'vae.json')) mu_dataset = [] logvar_dataset = [] action_dataset_real = [] oppo_action_dataset_real = [] for i in range(len(dataset)): data_batch = dataset[i] if len(data_batch) <= arglist.batch_size: continue else: data_batch = data_batch[:arglist.batch_size] if arglist.use_vae: mu, logvar, z = encode_batch(data_batch, arglist) mu_dataset.append(mu.astype(np.float16)) logvar_dataset.append(logvar.astype(np.float16))
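Once the loop above has filled mu_dataset, logvar_dataset and the action lists, one plausible way to persist them for MDN-RNN training is an .npz archive; the directory and file name below are placeholders, not taken from the original script.

import os
import numpy as np

os.makedirs('series', exist_ok=True)
np.savez_compressed(
    os.path.join('series', 'series.npz'),
    mu=np.array(mu_dataset),
    logvar=np.array(logvar_dataset),
    action=np.array(action_dataset_real),
    oppo_action=np.array(oppo_action_dataset_real))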
class Model: ''' simple one layer model for car racing ''' def __init__(self, arglist): self.env_name = arglist.game self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.vae.load_json(arglist.vae_file) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) self.rnn.load_json(arglist.rnn_file) self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 2) self.bias_output = np.random.randn(2) self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 2 + 2) else: self.weight = np.random.randn(self.input_size, 2) self.bias = np.random.randn(2) self.param_count = (self.input_size) * 2 + 2 self.render_mode = False def make_env(self, seed=-1, render_mode=False): self.render_mode = render_mode self.env = make_env(self.env_name, seed=seed, render_mode=render_mode) def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): # convert raw obs to z, mu, logvar result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def decode_obs(self, z): # decode the latent vector img = self.vae.decode(z.reshape(1, self.z_size)) * 255. img = np.round(img).astype(np.uint8) img = img.reshape(64, 64, 3) return img def get_action(self, z, arglist): h = rnn_output(self.state, z, EXP_MODE) ''' action = np.dot(h, self.weight) + self.bias action[0] = np.tanh(action[0]) action[1] = sigmoid(action[1]) action[2] = clip(np.tanh(action[2])) ''' if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(h, self.weight) + self.bias) if arglist.competitive: obs, rewards, done, win = self.env.step([action[0], 'script']) else: obs, rewards, done, win = self.env.step(action) extra_reward = 0.0 # penalize for turning too frequently if arglist.competitive: if arglist.train_mode and penalize_turning: extra_reward -= np.abs(action[0]) / 10.0 rewards[0] += extra_reward reward = rewards[0] else: if arglist.train_mode and penalize_turning: reward = np.sum(rewards) extra_reward -= np.abs(action[0]) / 10.0 reward += extra_reward # recording_reward.append(reward) # total_reward += reward self.state = rnn_next_state(self.rnn, z, action, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape( self.input_size, self.hidden_size) self.bias_output = params_2[:2] self.weight_output = params_2[2:].reshape(self.hidden_size, 2) else: self.bias = np.array(model_params[:2]) self.weight = np.array(model_params[2:]).reshape( self.input_size, 2) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) # assuming other stuff is 
in data self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): return np.random.randn(self.param_count) * stdev
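Since this Model variant exposes both encode_obs and decode_obs, a quick way to sanity-check the VAE is to round-trip one observation through the latent space; the model and obs variables below are assumed to come from an existing rollout.

import numpy as np
import matplotlib.pyplot as plt

z, mu, logvar = model.encode_obs(obs)   # obs: raw 64x64x3 frame
recon = model.decode_obs(z)             # decoded 64x64x3 uint8 image

fig, axes = plt.subplots(1, 2, figsize=(6, 3))
axes[0].imshow(obs.astype(np.uint8))
axes[0].set_title('raw obs')
axes[0].axis('off')
axes[1].imshow(recon)
axes[1].set_title('VAE reconstruction')
axes[1].axis('off')
plt.show()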
class VAERacing(CarRacing): def __init__(self, full_episode=False): super(VAERacing, self).__init__() self._internal_counter = 0 self.z_size = games['vae_racing'].input_size #print("vae_racing.py z", self.z_size) self.vae = ConvVAE(batch_size=1, z_size=self.z_size, gpu_mode=True, is_training=False, reuse=True) #print("vae_racing.py vae", self.vae) self.vae.load_json('vae/vae_'+str(self.z_size)+'.json') self.full_episode = full_episode high = np.array([np.inf] * self.z_size) self.observation_space = Box(-high, high) self._has_rendered = False self.real_frame = None def reset(self): self._internal_counter = 0 self._has_rendered = False self.real_frame = None obs = super(VAERacing, self).reset() result = np.copy(_process_frame(obs)).astype(np.float)/255.0 result = result.reshape(1, 64, 64, 3) self.real_frame = result mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar/2.0) * np.random.randn(*s) if MU_MODE: return mu return z def render(self, mode='human', close=False): if mode == 'human' or mode == 'rgb_array': self._has_rendered = True return super(VAERacing, self).render(mode=mode) def step(self, action): #print("action", action) if not self._has_rendered: self.render("rgb_array") self._has_rendered = False if action is not None: action[0] = _clip(action[0], lo=-1.0, hi=+1.0) action[1] = _clip(action[1], lo=-1.0, hi=+1.0) action[1] = (action[1]+1.0) / 2.0 action[2] = _clip(action[2]) obs, reward, done, _ = super(VAERacing, self).step(action) result = np.copy(_process_frame(obs)).astype(np.float)/255.0 result = result.reshape(1, 64, 64, 3) self.real_frame = result #z = self.vae.encode(result).flatten() mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar/2.0) * np.random.randn(*s) if self.full_episode: if MU_MODE: return mu, reward, False, {} else: return z, reward, False, {} self._internal_counter += 1 if self._internal_counter > TIME_LIMIT: done = True if MU_MODE: #print("mu", mu) return mu, reward, done, {} return z, reward, done, {}
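VAERacing depends on _process_frame and _clip helpers that are not shown in this snippet. A hypothetical sketch of what they are assumed to do: crop away the HUD bar of the 96x96 CarRacing frame, resize to the 64x64x3 input the VAE expects, and clamp actions into a range.

import numpy as np
from PIL import Image

def _process_frame(frame):
    # assumed behaviour: drop the bottom indicator bar, then resize to 64x64
    obs = frame[0:84, :, :]
    obs = np.array(Image.fromarray(obs).resize((64, 64)))
    return obs.astype(np.uint8)

def _clip(x, lo=-1.0, hi=1.0):
    return float(np.minimum(np.maximum(x, lo), hi))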
class Model: def __init__(self, load_model=True): self.env_name = "carracing" self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) if load_model: self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json') self.state = rnn_init_state(self.rnn) self.rnn_mode = True self.input_size = rnn_output_size(EXP_MODE) self.z_size = 32 if EXP_MODE == MODE_Z_HIDDEN: self.hidden_size = 40 self.weight_hidden = np.random.randn(self.input_size, self.hidden_size) self.bias_hidden = np.random.randn(self.hidden_size) self.weight_output = np.random.randn(self.hidden_size, 3) self.bias_output = np.random.randn(3) self.param_count = ((self.input_size + 1) * self.hidden_size) + (self.hidden_size * 3 + 3) else: self.weight = np.random.randn(self.input_size, 3) self.bias = np.random.randn(3) self.param_count = (self.input_size) * 3 + 3 self.render_mode = False def make_env(self, seed=-1, render_mode=False, full_episode=False): self.render_mode = render_mode self.env = make_env(self.env_name, seed=seed, render_mode=render_mode, full_episode=full_episode) def reset(self): self.state = rnn_init_state(self.rnn) def encode_obs(self, obs): result = np.copy(obs).astype(np.float) / 255.0 result = result.reshape(1, 64, 64, 3) mu, logvar = self.vae.encode_mu_logvar(result) mu = mu[0] logvar = logvar[0] s = logvar.shape z = mu + np.exp(logvar / 2.0) * np.random.randn(*s) return z, mu, logvar def get_action(self, z): h = rnn_output(self.state, z, EXP_MODE) if EXP_MODE == MODE_Z_HIDDEN: h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden) action = np.tanh(np.dot(h, self.weight_output) + self.bias_output) else: action = np.tanh(np.dot(h, self.weight) + self.bias) action[1] = (action[1] + 1.0) / 2.0 action[2] = clip(action[2]) self.state = rnn_next_state(self.rnn, z, action, self.state) return action def set_model_params(self, model_params): if EXP_MODE == MODE_Z_HIDDEN: params = np.array(model_params) cut_off = (self.input_size + 1) * self.hidden_size params_1 = params[:cut_off] params_2 = params[cut_off:] self.bias_hidden = params_1[:self.hidden_size] self.weight_hidden = params_1[self.hidden_size:].reshape(self.input_size, self.hidden_size) self.bias_output = params_2[:3] self.weight_output = params_2[3:].reshape(self.hidden_size, 3) else: self.bias = np.array(model_params[:3]) self.weight = np.array(model_params[3:]).reshape(self.input_size, 3) def load_model(self, filename): with open(filename) as f: data = json.load(f) print('loading file %s' % (filename)) self.data = data model_params = np.array(data[0]) self.set_model_params(model_params) def get_random_model_params(self, stdev=0.1): return np.random.standard_cauchy(self.param_count) * stdev def init_random_model_params(self, stdev=0.1): params = self.get_random_model_params(stdev=stdev) self.set_model_params(params) vae_params = self.vae.get_random_model_params(stdev=stdev) self.vae.set_model_params(vae_params) rnn_params = self.rnn.get_random_model_params(stdev=stdev) self.rnn.set_model_params(rnn_params)
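A minimal, hypothetical rollout using the Model above; the seed, episode length, and render settings are placeholder choices, and make_env/Model are assumed to be importable from this code base.

model = Model(load_model=True)
model.make_env(seed=0, render_mode=False, full_episode=False)

obs = model.env.reset()
model.reset()
total_reward = 0.0
for t in range(1000):
    z, mu, logvar = model.encode_obs(obs)
    action = model.get_action(z)
    obs, reward, done, info = model.env.step(action)
    total_reward += reward
    if done:
        break
print('cumulative reward: %.2f' % total_reward)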