def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    self.buff_len = 10
    self.env = EnvLearnerEnv(env_in, self.buff_len)
    self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    # Initialization
    self.buff_len = 10
    self.seq_len = 1
    self.max_seq_len = 5
    self.last_r = np.array([0.0]).flatten()
    self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
    dropout_rate = 0.5
    self.lr_disc = 1e-5
    self.lr_gen = 1e-5
    print('General Stats: ')
    print('Drop Rate: ' + str(dropout_rate))
    print('Buffer Len: ' + str(self.buff_len))
    print('Start Sequence Len: ' + str(self.seq_len))
    print('End Sequence Len: ' + str(self.max_seq_len))
    print('gan_model:')
    print('Disc Learning Rate: ' + str(self.lr_disc))
    print('Gen Learning Rate: ' + str(self.lr_gen))

    """ State Prediction """
    # Placeholders: flattened history buffer, ground-truth state sequence, action sequence.
    self.x_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.buff_init[0].size * self.buff_len]))
    self.y_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.state_dim * self.max_seq_len]))
    self.a_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.act_dim * self.max_seq_len]))
    a_seq_split = tf.split(self.a_seq, self.max_seq_len, 1)
    y_seq_split = tf.split(self.y_seq, self.max_seq_len, 1)

    input_tmp_seq = tf.split(self.x_seq, self.buff_len, 1)
    self.out_state_raw = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)
    self.out_state = self.out_state_raw * self.state_mul_const

    # Unroll the generator seq_len steps, feeding each prediction back in as the newest state.
    self.loss_seq = 0.0
    self.loss_last = 0.0
    out_states = []
    out_states.append(self.out_state_raw)
    self.loss_seq += losses.loss_p(out_states[-1], y_seq_split[0])
    self.loss_last += losses.loss_p(out_states[-1], tf.slice(input_tmp_seq[-1], [0, 0], [-1, self.state_dim]))
    for i in range(1, self.seq_len):
        state_tmp = tf.slice(self.x_seq[:], [0, self.buff_init[0].size], [-1, -1])
        state_tmp = tf.concat([state_tmp, out_states[-1]], axis=1)
        input_tmp = tf.concat([state_tmp, a_seq_split[i]], axis=1)
        input_tmp_seq = tf.split(input_tmp, self.buff_len, 1)
        out_state_raw_tmp = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)
        out_states.append(out_state_raw_tmp)
        self.loss_seq += losses.loss_p(out_states[-1], y_seq_split[i])
        self.loss_last += losses.loss_p(out_states[-1], out_states[-2])

    self.out_state_seq = tf.concat(out_states, axis=1)
    self.loss_state = self.loss_seq
    self.train_step_state = tf.train.AdamOptimizer(self.lr_gen).minimize(self.loss_state)

    """ GAN Stuff """
    # Pair each (real or generated) state with its action before discrimination.
    x_seq = []
    g_seq = []
    out_seq_split = tf.split(self.out_state_seq, self.seq_len, 1)
    for i in range(self.seq_len):
        x_seq.append(tf.concat([y_seq_split[i], a_seq_split[i]], axis=1))
        g_seq.append(tf.concat([out_seq_split[i], a_seq_split[i]], axis=1))
    x_in = x_seq
    g_in = g_seq

    self.Dx = models.discriminator_model(x_in, drop_rate=dropout_rate)
    self.Dg = models.discriminator_model(g_in, drop_rate=dropout_rate)
    var_d = tf.trainable_variables('discriminator')
    var_g = tf.trainable_variables('generator')

    self.g_lambda = 1.0
    self.p_lambda = 0.0
    self.t_lambda = 0.0

    """ Vanilla GAN """
    # self.n_d = 1
    # self.disc_loss = -tf.reduce_mean(tf.log(self.Dx) + tf.log(1 - self.Dg))
    # self.g_loss = -tf.reduce_mean(tf.log(self.Dg))
    # self.gen_loss = self.g_lambda * self.g_loss + self.p_lambda * self.loss_seq
    # self.train_step_disc = tf.train.AdamOptimizer(self.lr_disc).minimize(self.disc_loss, var_list=var_d)
    # self.train_step_gen = tf.train.AdamOptimizer(self.lr_gen).minimize(self.gen_loss, var_list=var_g)

    """ WGAN-GP """
    self.n_d = 5
    self.epsilon = 0.01
    self.gp_lambda = 10

    self.disc_loss = tf.reduce_mean(self.Dg) - tf.reduce_mean(self.Dx)
    self.g_loss = -tf.reduce_mean(self.Dg)
    self.gen_loss = self.g_lambda * self.g_loss + \
                    self.p_lambda * self.loss_seq + \
                    self.t_lambda * self.loss_last

    # Gradient penalty: one term per discriminator weight tensor (the first two are excluded).
    x_hat = self.epsilon * self.Dx + (1 - self.epsilon) * self.Dg
    grad_list = tf.gradients(x_hat, var_d)[2:]
    GP = 0.0
    for layer in grad_list:
        GP += self.gp_lambda * (tf.sqrt(tf.reduce_sum(tf.square(layer))) - 1) ** 2
    self.disc_loss += GP

    # Adam with beta1=0, beta2=0.9, as is common for WGAN-GP.
    self.train_step_disc = tf.train.AdamOptimizer(self.lr_disc, beta1=0, beta2=0.9).minimize(self.disc_loss, var_list=var_d)
    self.train_step_gen = tf.train.AdamOptimizer(self.lr_gen, beta1=0, beta2=0.9).minimize(self.gen_loss, var_list=var_g)
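# --- Hedged usage sketch (assumption, not part of the original source) ---
# One possible way to drive the ops defined above on a single minibatch: n_d critic
# updates per generator update, as is conventional for WGAN-GP. The names `model`,
# `sess`, and the batch arrays are hypothetical; the real train_adv in this codebase
# takes raw episode data, so this only illustrates the op ordering.
def train_adv_sketch(sess, model, batch_x, batch_y, batch_a):
    feed = {model.x_seq: batch_x, model.y_seq: batch_y, model.a_seq: batch_a}
    d_loss = 0.0
    for _ in range(model.n_d):
        # Critic (discriminator) updates.
        _, d_loss = sess.run([model.train_step_disc, model.disc_loss], feed_dict=feed)
    # Single generator update.
    _, g_loss = sess.run([model.train_step_gen, model.gen_loss], feed_dict=feed)
    return g_loss, d_loss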
def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    # Initialization
    self.buff_len = 10
    self.seq_len = 5
    self.max_seq_len = 5
    self.last_r = np.array([0.0]).flatten()
    self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
    dropout_rate = 0.5
    self.lr_disc = 1e-5
    self.lr_gen = 1e-5
    print('General Stats: ')
    print('Drop Rate: ' + str(dropout_rate))
    print('Buffer Len: ' + str(self.buff_len))
    print('Start Sequence Len: ' + str(self.seq_len))
    print('End Sequence Len: ' + str(self.max_seq_len))
    print('dnn_model:')
    print('Disc Learning Rate: ' + str(self.lr_disc))
    print('Gen Learning Rate: ' + str(self.lr_gen))
    discount = 1

    """ State Prediction """
    self.x_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.buff_init[0].size * self.buff_len]))
    self.y_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.state_dim * self.max_seq_len]))
    self.a_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.act_dim * self.max_seq_len]))
    a_seq_split = tf.split(self.a_seq, self.max_seq_len, 1)
    y_seq_split = tf.split(self.y_seq, self.max_seq_len, 1)

    input_tmp_seq = tf.split(self.x_seq, self.buff_len, 1)
    self.out_state_raw = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)
    self.out_state = self.out_state_raw * self.state_mul_const

    # Unroll the model seq_len steps, feeding each prediction back in as the newest state.
    self.loss_seq = 0.0
    self.loss_last = 0.0
    out_states = []
    out_states.append(self.out_state_raw)
    self.loss_seq += losses.loss_p(out_states[-1], y_seq_split[0])
    self.loss_last += losses.loss_p(out_states[-1], tf.slice(input_tmp_seq[-1], [0, 0], [-1, self.state_dim]))
    for i in range(1, self.seq_len):
        state_tmp = tf.slice(self.x_seq[:], [0, self.buff_init[0].size], [-1, -1])
        state_tmp = tf.concat([state_tmp, out_states[-1]], axis=1)
        input_tmp = tf.concat([state_tmp, a_seq_split[i]], axis=1)
        input_tmp_seq = tf.split(input_tmp, self.buff_len, 1)
        out_state_raw_tmp = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)
        out_states.append(out_state_raw_tmp)
        self.loss_seq += (discount ** (i - 1)) * losses.loss_p(out_states[-1], y_seq_split[i])
        self.loss_last += losses.loss_p(out_states[-1], out_states[-2])

    self.out_state_seq = tf.concat(out_states, axis=1)
    self.loss_state = self.loss_seq
    self.train_step_state = tf.train.AdamOptimizer(self.lr_gen).minimize(self.loss_state)

    self.loss = self.loss_seq
    self.train_step = tf.train.AdamOptimizer(self.lr_gen, beta1=0, beta2=0.9).minimize(self.loss)
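# --- Hedged usage sketch (assumption, not from the original source) ---
# The dnn_model variant above is purely supervised: one Adam step on `self.loss`
# per minibatch. `model`, `sess`, and the batch arrays are hypothetical names for
# whatever the surrounding training loop actually provides.
def train_step_sketch(sess, model, batch_x, batch_y, batch_a):
    feed = {model.x_seq: batch_x, model.y_seq: batch_y, model.a_seq: batch_a}
    _, seq_loss = sess.run([model.train_step, model.loss], feed_dict=feed)
    return seq_loss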
def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    self.buff_init = [np.zeros(self.state_dim + self.act_dim)]
class SimpleArm(gym.Env):
    """Gym env that wraps SimpleArmRaw but steps a learned EnvLearner model instead of the raw arm."""

    def __init__(self):
        self.r = SimpleArmRaw().r
        self.max_iter = SimpleArmRaw().max_iter
        self.t = 0
        self.action_space = SimpleArmRaw().action_space
        self.observation_space = SimpleArmRaw().observation_space
        self.obs = self.reset()
        self.inited = False

    def init(self, sess):
        self.env_learner = EnvLearner(SimpleArmRaw(train=True))
        self.env_learner.initialize(sess)
        train, valid = self.__gen_train_data__()
        print('Data Gathered')
        self.__train_self_model__(train, valid)
        print('Model Trained')
        self.inited = True

    def __gen_train_data__(self):
        # Roll out random actions on the raw arm to collect training and validation transitions.
        env = SimpleArmRaw(train=True)
        train_episodes = 100
        nb_valid_episodes = 50
        episode_duration = -1
        max_action = env.action_space.high
        episode_step = 0.0
        episode_reward = 0.0
        max_ep_rew = -1000
        train = []
        valid = []

        obs = env.reset()
        i = 0
        while i < train_episodes:
            action = np.random.uniform(-1, 1, env.action_space.shape[0])
            # action = find_next_move(env, self.env_learner, obs, max_action, episode_step)
            new_obs, r, done, info = env.step(max_action * action)
            if episode_duration > 0:
                done = (done or (episode_step >= episode_duration))
            train.append([obs, max_action * action, r, new_obs, done, episode_step])
            episode_step += 1
            obs = new_obs
            episode_reward += r
            if done:
                episode_step = 0.0
                obs = env.reset()
                max_ep_rew = max(max_ep_rew, episode_reward)
                episode_reward = 0.0
                i += 1

        i = 0
        while i < nb_valid_episodes:
            action = np.random.uniform(-1, 1, env.action_space.shape[0])
            # action = find_next_move(env, self.env_learner, obs, max_action, episode_step)
            new_obs, r, done, info = env.step(max_action * action)
            if episode_duration > 0:
                done = (done or (episode_step >= episode_duration))
            valid.append([obs, max_action * action, r, new_obs, done, episode_step])
            episode_step += 1
            obs = new_obs
            episode_reward += r
            if done:
                obs = env.reset()
                max_ep_rew = max(max_ep_rew, episode_reward)
                episode_reward = 0.0
                i += 1
        return train, valid

    def __train_self_model__(self, train, valid):
        total_steps = 50
        log_interval = 10
        import time
        min_loss = 10000000000
        stop_count = 0
        for i in range(total_steps):
            # Grow the prediction horizon on a fixed schedule until max_seq_len is reached.
            if i > 0 and i % (total_steps / self.env_learner.max_seq_len) == 0 and \
                    self.env_learner.seq_len < self.env_learner.max_seq_len:
                self.env_learner.seq_len += 1
                print('Sequence Length: ' + str(self.env_learner.seq_len))
            if i % log_interval == 0 and valid is not None:
                (vGen, vDisc, vC) = self.env_learner.get_loss(valid)
                print('Epoch: ' + str(i) + '/' + str(total_steps))
                print('Valid Loss')
                print('Gen: ' + str(vGen))
                print('Disc: ' + str(vDisc))
                print('Close: ' + str(vC))
                print()
                # if saver is not None and save_str is not None:
                #     save_path = saver.save(self.env_learner.sess, 'models/' + str(save_str) + '.ckpt')
                #     print("Model saved in path: %s" % save_path)
            start = time.time()
            tlGen, tlDisc = self.env_learner.train_adv(train)
            duration = time.time() - start
            if tlGen < min_loss:
                min_loss = tlGen
                stop_count = 0
            else:
                stop_count += 1
            if i % log_interval != 0:
                print('Epoch: ' + str(i) + '/' + str(total_steps) + ' in ' + str(duration) + 's')
                print('Train Loss')
                print('Gen: ' + str(tlGen))
                print('Disc: ' + str(tlDisc))
                print()
        if valid is not None:
            (vGen, vDisc, vC) = self.env_learner.get_loss(valid)
            print('Final Epoch: ')
            print('Valid Loss')
            print('Gen: ' + str(vGen))
            print('Disc: ' + str(vDisc))
            print('Close: ' + str(vC))
            print()
            # if saver is not None and save_str is not None:
            #     save_path = saver.save(self.env_learner.sess, 'models/' + str(save_str) + '.ckpt')
            #     print("Final Model saved in path: %s" % save_path)

    # def __get_obs__(self):
    #     return np.concatenate([self.x, self.y, np.array([self.d])], axis=0)

    def __get_obs__(self):
        # Observation: joint angles, intermediate joint (elbow) positions,
        # end-effector position, and distance to target.
        elbows = []
        last_ver = 0.0
        last_hor = 0.0
        elbow = np.zeros(3)
        for j in range(self.r.size - 1):
            elbow[0] += float(self.r[j] * math.cos(last_hor + self.x[2 * j]) *
                              math.sin(math.pi / 2 - last_ver - self.x[2 * j + 1]))
            elbow[1] += float(self.r[j] * math.sin(last_hor + self.x[2 * j]) *
                              math.sin(math.pi / 2 - last_ver - self.x[2 * j + 1]))
            elbow[2] += float(self.r[j] * math.cos(math.pi / 2 - last_ver - self.x[2 * j + 1]))
            elbows.append(elbow)
        elbows = np.concatenate(elbows)
        return np.concatenate([self.x, elbows, self.y, np.array([self.d])], axis=0)

    def __get_pos__(self, x):
        # Forward kinematics: end-effector position for joint angles x.
        y = np.zeros(3)
        last_ver = 0.0
        last_hor = 0.0
        for j in range(self.r.size):
            y[0] += float(self.r[j] * math.cos(last_hor + x[2 * j]) *
                          math.sin(math.pi / 2 - last_ver - x[2 * j + 1]))
            y[1] += float(self.r[j] * math.sin(last_hor + x[2 * j]) *
                          math.sin(math.pi / 2 - last_ver - x[2 * j + 1]))
            y[2] += float(self.r[j] * math.cos(math.pi / 2 - last_ver - x[2 * j + 1]))
            last_hor += x[2 * j]
            last_ver += x[2 * j + 1]
        return y

    def reset(self):
        self.t = 0
        np.random.seed()
        self.x = np.random.uniform(-math.pi, math.pi, 2 * self.r.size)
        self.y = self.__get_pos__(self.x)
        np.random.seed()
        tmp = np.random.uniform(-math.pi, math.pi, 2 * self.r.size)
        self.target = self.__get_pos__(tmp)
        # print(self.target)
        self.iteration = 0
        self.d = np.linalg.norm(self.y - self.target)
        self.state = self.__get_obs__()
        return self.state

    def step(self, action):
        # Roll the learned model forward instead of the raw arm; the trailing
        # distance element is stripped before prediction and re-appended after.
        new_obs = self.env_learner.step(self.obs[:-1], action, self.t)
        self.t += 1
        d = np.linalg.norm(self.target - new_obs[-3:])
        if self.t == 1:
            # First step: reward is the negative distance to the target.
            self.rew = -d
        else:
            # Later steps: reward is the decrease in distance since the last step.
            self.rew = self.d - d
        self.d = d
        self.obs = np.concatenate([new_obs, np.array([self.d])])
        self.done = (self.t >= self.max_iter)
        return self.obs, self.rew, self.done, {}
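# --- Hedged usage sketch (assumption, not part of the original source) ---
# Roughly how SimpleArm might be driven: init() gathers data from the raw arm and
# trains the EnvLearner, after which step() rolls out against the learned model.
# Assumes tf and the classes above are importable in this scope.
def run_simple_arm_sketch():
    with tf.Session() as sess:
        env = SimpleArm()
        env.init(sess)          # gather data and train the learned model
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            obs, rew, done, _ = env.step(action)
        return rew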
def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    self.knn = neighbors.KNeighborsRegressor(5, weights='distance')
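# --- Hedged sketch (assumption, not from the original source) ---
# One way the kNN regressor above could be fit: stack (state, action) pairs as
# inputs and the observed next states as targets. The transition layout
# [obs, action, r, new_obs, done, step] mirrors the tuples built in
# __gen_train_data__ above; treat the exact API as an assumption.
def fit_knn_sketch(model, transitions):
    X = np.array([np.concatenate([obs, act]) for obs, act, _, _, _, _ in transitions])
    Y = np.array([new_obs for _, _, _, new_obs, _, _ in transitions])
    model.knn.fit(X, Y)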
def __init__(self, env_in):
    EnvLearner.__init__(self, env_in)
    # from baselines.ddpg.models import Actor, Critic

    # Parse noise_type
    action_noise = None
    param_noise = None
    noise_type = 'adaptive-param_0.2'
    layer_norm = True
    nb_actions = self.state_dim
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    self.buff_len = 10
    self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
    obs_space = (self.buff_init[0].size * self.buff_len,)
    self.memory = Memory(limit=int(1e6), action_shape=env_in.observation_space.shape,
                         observation_shape=obs_space)
    self.critic = models.Critic(layer_norm=layer_norm)
    self.actor = models.Actor(nb_actions, layer_norm=layer_norm)

    self.agent = DDPG(self.actor, self.critic, self.memory, obs_space, env_in.observation_space.shape,
                      gamma=0.99, tau=0.01, normalize_returns=False, normalize_observations=True,
                      batch_size=64, action_noise=action_noise, param_noise=param_noise,
                      critic_l2_reg=1e-2, actor_lr=1e-5, critic_lr=1e-5, enable_popart=False,
                      clip_norm=None, reward_scale=1.)
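# --- Hedged sketch (assumption, based on the OpenAI baselines DDPG interface) ---
# Note from the constructor above: this agent's "observation" is the flattened
# history buffer and its "action" is the predicted next state. The sketch below
# only shows the usual session bookkeeping; transition storage and updates would
# use the standard baselines calls (store_transition / train / update_target_net)
# with shapes matching the Memory configured above. Verify against the pinned
# baselines version before relying on these method names.
def ddpg_init_sketch(model, sess):
    model.agent.initialize(sess)   # bind the session, init variables, sync target nets
    model.agent.reset()            # reset exploration noise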