    def init(self, sess):
        self.env_learner = EnvLearner(SimpleArmRaw(train=True))
        self.env_learner.initialize(sess)
        train, valid = self.__gen_train_data__()
        print('Data Gathered')
        self.__train_self_model__(train, valid)
        print('Model Trained')
        self.inited = True
Example #2
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        self.buff_len = 10
        self.env = EnvLearnerEnv(env_in, self.buff_len)
        self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
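        # Note: self.buff_init is a one-element list (see Example #5 below, where it is
        # [np.zeros(self.state_dim + self.act_dim)]), so self.buff_init * self.buff_len
        # pre-fills the deque with buff_len references to the zero frame, and maxlen
        # keeps it as a fixed-size rolling window of recent state/action frames.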
Example #3
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        # Initialization
        self.buff_len = 10
        self.seq_len = 1
        self.max_seq_len = 5
        self.last_r = np.array([0.0]).flatten()
        self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
        dropout_rate = 0.5
        self.lr_disc = 1e-5
        self.lr_gen = 1e-5
        print('General Stats: ')
        print('Drop Rate: ' + str(dropout_rate))
        print('Buffer Len: ' + str(self.buff_len))
        print('Start Sequence Len: ' + str(self.seq_len))
        print('End Sequence Len: ' + str(self.max_seq_len))
        print('gan_model:')
        print('Disc Learning Rate: ' + str(self.lr_disc))
        print('Gen Learning Rate: ' + str(self.lr_gen))

        """ State Prediction """
        self.x_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.buff_init[0].size * self.buff_len]))
        self.y_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.state_dim * self.max_seq_len]))
        self.a_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.act_dim * self.max_seq_len]))

        a_seq_split = tf.split(self.a_seq, self.max_seq_len, 1)
        y_seq_split = tf.split(self.y_seq, self.max_seq_len, 1)

        input_tmp_seq = tf.split(self.x_seq, self.buff_len, 1)
        self.out_state_raw = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)

        self.out_state = self.out_state_raw*self.state_mul_const
        self.loss_seq = 0.0
        self.loss_last = 0.0
        out_states = []
        out_states.append(self.out_state_raw)
        self.loss_seq += losses.loss_p(out_states[-1], y_seq_split[0])
        self.loss_last += losses.loss_p(out_states[-1], tf.slice(input_tmp_seq[-1], [0, 0], [-1, self.state_dim]))
        for i in range(1, self.seq_len):
            state_tmp = tf.slice(self.x_seq[:],
                                   [0, self.buff_init[0].size],
                                   [-1, -1]
                                   )
            state_tmp = tf.concat([state_tmp, out_states[-1]], axis=1)
            input_tmp = tf.concat([state_tmp, a_seq_split[i]], axis=1)

            input_tmp_seq = tf.split(input_tmp, self.buff_len, 1)
            out_state_raw_tmp = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)
            out_states.append(out_state_raw_tmp)
            self.loss_seq += losses.loss_p(out_states[-1], y_seq_split[i])
            self.loss_last += losses.loss_p(out_states[-1], out_states[-2])

        self.out_state_seq = tf.concat(out_states, axis=1)

        self.loss_state = self.loss_seq

        self.train_step_state = tf.train.AdamOptimizer(self.lr_gen).minimize(self.loss_state)

        """ GAN Stuff """
        x_seq = []
        g_seq = []
        out_seq_split = tf.split(self.out_state_seq, self.seq_len, 1)
        for i in range(self.seq_len):
            x_seq.append(tf.concat([y_seq_split[i], a_seq_split[i]], axis=1))
            g_seq.append(tf.concat([out_seq_split[i], a_seq_split[i]], axis=1))

        x_in = x_seq
        g_in = g_seq
        self.Dx = models.discriminator_model(x_in, drop_rate=dropout_rate)
        self.Dg = models.discriminator_model(g_in, drop_rate=dropout_rate)
        var_d = tf.trainable_variables('discriminator')
        var_g = tf.trainable_variables('generator')
        self.g_lambda = 1.0
        self.p_lambda = 0.0
        self.t_lambda = 0.0

        """ Vanilla GAN """
        # self.n_d = 1
        # self.disc_loss = -tf.reduce_mean(tf.log(self.Dx) + tf.log(1-self.Dg))
        # self.g_loss = -tf.reduce_mean(tf.log(self.Dg))
        # self.gen_loss =  g_lambda*self.g_loss + p_lambda * self.loss_seq
        # self.train_step_disc = tf.train.AdamOptimizer(lr_disc).minimize(self.disc_loss, var_list=var_d)
        # self.train_step_gen = tf.train.AdamOptimizer(lr_gen).minimize(self.gen_loss, var_list=var_g)

        """ WGAN-GP """
        self.n_d = 5
        self.epsilon = 0.01
        self.gp_lambda = 10

        self.disc_loss = tf.reduce_mean(self.Dg) - tf.reduce_mean(self.Dx)
        self.g_loss = -tf.reduce_mean(self.Dg)
        self.gen_loss =  self.g_lambda*self.g_loss + \
                         self.p_lambda * self.loss_seq + \
                         self.t_lambda * self.loss_last
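        # Note: x_hat below interpolates the discriminator *outputs* Dx and Dg with a
        # fixed epsilon, and the penalty is taken on gradients w.r.t. the discriminator
        # variables (skipping the first two via [2:]). The canonical WGAN-GP instead
        # interpolates real/generated inputs with a random epsilon and penalizes the
        # gradient w.r.t. that interpolated input; a sketch of that form follows this
        # example.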
        x_hat = self.epsilon*self.Dx + (1-self.epsilon)*self.Dg
        grad_list = tf.gradients(x_hat, var_d)[2:]
        GP = 0.0
        for layer in grad_list:
            GP += self.gp_lambda * (tf.sqrt(tf.reduce_sum(tf.square(layer))) - 1) ** 2
        self.disc_loss += GP
        self.train_step_disc = tf.train.AdamOptimizer(self.lr_disc, beta1=0, beta2=0.9).minimize(self.disc_loss, var_list=var_d)
        self.train_step_gen = tf.train.AdamOptimizer(self.lr_gen, beta1=0, beta2=0.9).minimize(self.gen_loss, var_list=var_g)
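For comparison, below is a minimal sketch (my own, not from this repo) of the canonical WGAN-GP penalty. The names x_real, x_fake, discriminator, d_real, d_fake and gp_lambda are placeholders for whatever real/generated inputs and shared critic network the surrounding model provides; epsilon is sampled per example from U(0, 1) and the penalty applies to the gradient of the critic with respect to the interpolated input:

eps = tf.random_uniform([tf.shape(x_real)[0], 1], 0.0, 1.0)   # per-example epsilon ~ U(0, 1)
x_hat = eps * x_real + (1.0 - eps) * x_fake                   # interpolate inputs, not critic outputs
d_hat = discriminator(x_hat)                                  # assumed to reuse the critic weights
grads = tf.gradients(d_hat, [x_hat])[0]
grad_norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1) + 1e-12)
gp = gp_lambda * tf.reduce_mean((grad_norm - 1.0) ** 2)
disc_loss = tf.reduce_mean(d_fake) - tf.reduce_mean(d_real) + gp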
Example #4
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        # Initialization
        self.buff_len = 10
        self.seq_len = 5
        self.max_seq_len = 5
        self.last_r = np.array([0.0]).flatten()
        self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len)
        dropout_rate = 0.5
        self.lr_disc = 1e-5
        self.lr_gen = 1e-5
        print('General Stats: ')
        print('Drop Rate: ' + str(dropout_rate))
        print('Buffer Len: ' + str(self.buff_len))
        print('Start Sequence Len: ' + str(self.seq_len))
        print('End Sequence Len: ' + str(self.max_seq_len))
        print('dnn_model:')
        print('Disc Learning Rate: ' + str(self.lr_disc))
        print('Gen Learning Rate: ' + str(self.lr_gen))

        discount = 1

        """ State Prediction """
        self.x_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.buff_init[0].size * self.buff_len]))
        self.y_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.state_dim * self.max_seq_len]))
        self.a_seq = tf.placeholder(dtype=tf.float32, shape=([None, self.act_dim * self.max_seq_len]))

        a_seq_split = tf.split(self.a_seq, self.max_seq_len, 1)
        y_seq_split = tf.split(self.y_seq, self.max_seq_len, 1)

        input_tmp_seq = tf.split(self.x_seq, self.buff_len, 1)
        self.out_state_raw = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)

        self.out_state = self.out_state_raw*self.state_mul_const
        self.loss_seq = 0.0
        self.loss_last = 0.0
        out_states = []
        out_states.append(self.out_state_raw)
        self.loss_seq += losses.loss_p(out_states[-1], y_seq_split[0])
        self.loss_last += losses.loss_p(out_states[-1], tf.slice(input_tmp_seq[-1], [0, 0], [-1, self.state_dim]))
        for i in range(1, self.seq_len):
            state_tmp = tf.slice(self.x_seq[:],
                                   [0, self.buff_init[0].size],
                                   [-1, -1]
                                   )
            state_tmp = tf.concat([state_tmp, out_states[-1]], axis=1)
            input_tmp = tf.concat([state_tmp, a_seq_split[i]], axis=1)

            input_tmp_seq = tf.split(input_tmp, self.buff_len, 1)
            out_state_raw_tmp = models.generator_model(input_tmp_seq, self.state_dim, drop_rate=dropout_rate)
            out_states.append(out_state_raw_tmp)
            self.loss_seq += (discount**(i-1))*losses.loss_p(out_states[-1], y_seq_split[i])
            self.loss_last += losses.loss_p(out_states[-1], out_states[-2])

        self.out_state_seq = tf.concat(out_states, axis=1)

        self.loss_state = self.loss_seq

        self.train_step_state = tf.train.AdamOptimizer(self.lr_gen).minimize(self.loss_state)

        self.loss =  self.loss_seq
        self.train_step = tf.train.AdamOptimizer(self.lr_gen, beta1=0, beta2=0.9).minimize(self.loss)
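The rollout loop above accumulates a discounted multi-step prediction loss: step 0 is weighted 1 and step i >= 1 by discount**(i-1) (with discount = 1 this reduces to a plain sum). A tiny numeric illustration of mine, using made-up per-step loss_p values:

import numpy as np

discount = 1
per_step = np.array([0.40, 0.30, 0.25, 0.20, 0.18])    # hypothetical loss_p values, seq_len = 5
weights = np.array([1.0] + [discount ** (i - 1) for i in range(1, per_step.size)])
loss_seq = float(np.sum(weights * per_step))            # equals per_step.sum() when discount == 1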
Example #5
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        self.buff_init = [np.zeros(self.state_dim + self.act_dim)]
Example #6
class SimpleArm(gym.Env):
    def __init__(self):
        raw = SimpleArmRaw()
        self.r = raw.r
        self.max_iter = raw.max_iter
        self.t = 0

        self.action_space = raw.action_space
        self.observation_space = raw.observation_space

        self.obs = self.reset()
        self.inited = False

    def init(self, sess):
        self.env_learner = EnvLearner(SimpleArmRaw(train=True))
        self.env_learner.initialize(sess)
        train, valid = self.__gen_train_data__()
        print('Data Gathered')
        self.__train_self_model__(train, valid)
        print('Model Trained')
        self.inited = True

    def __gen_train_data__(self):
        env = SimpleArmRaw(train=True)
        train_episodes = 100
        nb_valid_episodes = 50
        episode_duration = -1
        max_action = env.action_space.high
        episode_step = 0.0
        episode_reward = 0.0
        max_ep_rew = -1000

        train = []
        valid = []
        obs = env.reset()
        i = 0
        while i < train_episodes:
            action = np.random.uniform(-1, 1, env.action_space.shape[0])
            # action = find_next_move(env, self.env_learner, obs, max_action, episode_step)
            new_obs, r, done, info = env.step(max_action * action)
            if episode_duration > 0:
                done = (done or (episode_step >= episode_duration))
            train.append(
                [obs, max_action * action, r, new_obs, done, episode_step])
            episode_step += 1
            obs = new_obs

            episode_reward += r
            if done:
                episode_step = 0.0
                obs = env.reset()
                max_ep_rew = max(max_ep_rew, episode_reward)
                episode_reward = 0.0
                i += 1

        i = 0
        while i < nb_valid_episodes:
            action = np.random.uniform(-1, 1, env.action_space.shape[0])
            # action = find_next_move(env, self.env_learner, obs, max_action, episode_step)
            new_obs, r, done, info = env.step(max_action * action)
            if episode_duration > 0:
                done = (done or (episode_step >= episode_duration))
            valid.append(
                [obs, max_action * action, r, new_obs, done, episode_step])
            episode_step += 1
            obs = new_obs

            episode_reward += r
            if done:
                obs = env.reset()
                max_ep_rew = max(max_ep_rew, episode_reward)
                episode_reward = 0.0
                i += 1
        return train, valid
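        # Each element appended above is
        #   [obs, max_action * action, r, new_obs, done, episode_step],
        # i.e. (state, scaled action, reward, next state, done flag, step index).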

    def __train_self_model__(self, train, valid):
        total_steps = 50
        log_interval = 10
        import time

        min_loss = 10000000000
        stop_count = 0
        for i in range(total_steps):
            if i > 0 and i % (
                    total_steps / self.env_learner.max_seq_len
            ) == 0 and self.env_learner.seq_len < self.env_learner.max_seq_len:
                self.env_learner.seq_len += 1
                print('Sequence Length: ' + str(self.env_learner.seq_len))

            if i % log_interval == 0 and valid is not None:
                (vGen, vDisc, vC) = self.env_learner.get_loss(valid)
                print('Epoch: ' + str(i) + '/' + str(total_steps))
                print('Valid Loss')
                print('Gen:  ' + str(vGen))
                print('Disc: ' + str(vDisc))
                print('Close: ' + str(vC))
                print()
                # if saver is not None and save_str is not None:
                #     save_path = saver.save(self.env_learner.sess, 'models/' + str(save_str) + '.ckpt')
                #     print("Model saved in path: %s" % save_path)
            start = time.time()
            tlGen, tlDisc = self.env_learner.train_adv(train)
            duration = time.time() - start
            if tlGen < min_loss:
                min_loss = tlGen
                stop_count = 0
            else:
                stop_count += 1
            if i % log_interval != 0:
                print('Epoch: ' + str(i) + '/' + str(total_steps) + ' in ' +
                      str(duration) + 's')
                print('Train Loss')
                print('Gen:  ' + str(tlGen))
                print('Disc: ' + str(tlDisc))
                print()
        if valid is not None:
            (vGen, vDisc, vC) = self.env_learner.get_loss(valid)
            print('Final Epoch: ')
            print('Valid Loss')
            print('Gen:  ' + str(vGen))
            print('Disc: ' + str(vDisc))
            print('Close: ' + str(vC))
            print()
        # if saver is not None and save_str is not None:
        #     save_path = saver.save(self.env_learner.sess, 'models/' + str(save_str) + '.ckpt')
        #     print("Final Model saved in path: %s" % save_path)

    #
    # def __get_obs__(self):
    #     return np.concatenate([self.x, self.y, np.array([self.d])], axis=0)

    def __get_obs__(self):
        elbows = []
        last_ver = 0.0
        last_hor = 0.0
        elbow = np.zeros(3)
        for j in range(self.r.size - 1):
            elbow[0] += float(
                self.r[j] * math.cos(last_hor + self.x[2 * j]) *
                math.sin(math.pi / 2 - last_ver - self.x[2 * j + 1]))
            elbow[1] += float(
                self.r[j] * math.sin(last_hor + self.x[2 * j]) *
                math.sin(math.pi / 2 - last_ver - self.x[2 * j + 1]))
            elbow[2] += float(
                self.r[j] *
                math.cos(math.pi / 2 - last_ver - self.x[2 * j + 1]))
            # Accumulate the joint angles as in __get_pos__ and store a snapshot;
            # appending `elbow` itself would leave every entry aliasing the same array.
            last_hor += self.x[2 * j]
            last_ver += self.x[2 * j + 1]
            elbows.append(elbow.copy())
        elbows = np.concatenate(elbows)
        return np.concatenate([self.x, elbows, self.y,
                               np.array([self.d])],
                              axis=0)

    def __get_pos__(self, x):
        y = np.zeros(3)
        last_ver = 0.0
        last_hor = 0.0
        for j in range(self.r.size):
            y[0] += float(self.r[j] * math.cos(last_hor + x[2 * j]) *
                          math.sin(math.pi / 2 - last_ver - x[2 * j + 1]))
            y[1] += float(self.r[j] * math.sin(last_hor + x[2 * j]) *
                          math.sin(math.pi / 2 - last_ver - x[2 * j + 1]))
            y[2] += float(self.r[j] *
                          math.cos(math.pi / 2 - last_ver - x[2 * j + 1]))
            last_hor += x[2 * j]
            last_ver += x[2 * j + 1]
        return y
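    # Sanity check (illustration only): with all joint angles at zero, cos(0) = 1 and
    # sin(pi / 2) = 1, so every link contributes only to y[0]; __get_pos__ on a zero
    # vector therefore returns approximately [sum(self.r), 0, 0].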

    def reset(self):
        self.t = 0
        np.random.seed()
        self.x = np.random.uniform(-math.pi, math.pi, 2 * self.r.size)
        self.y = self.__get_pos__(self.x)
        np.random.seed()
        tmp = np.random.uniform(-math.pi, math.pi, 2 * self.r.size)
        self.target = self.__get_pos__(tmp)
        # print(self.target)
        self.iteration = 0
        self.d = np.linalg.norm(self.y - self.target)
        self.state = self.__get_obs__()
        return self.state

    def step(self, action):
        new_obs = self.env_learner.step(self.obs[:-1], action, self.t)
        self.t += 1
        d = np.linalg.norm(self.target - new_obs[-3:])
        if self.t == 1:
            self.rew = -d
        else:
            self.rew = self.d - d
        self.d = d
        self.obs = np.concatenate([new_obs, np.array([self.d])])
        self.done = (self.t >= self.max_iter)
        return self.obs, self.rew, self.done, {}
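A minimal usage sketch of my own for the class above, assuming a TF1 session and that action_space is a standard gym Box (so .sample() exists); init(sess) gathers SimpleArmRaw rollouts and trains the learned model before the rollout loop runs against it:

import tensorflow as tf

with tf.Session() as sess:
    env = SimpleArm()
    env.init(sess)                        # gather data from SimpleArmRaw and train the model
    obs = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        obs, rew, done, _ = env.step(action)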
Example #7
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        self.knn = neighbors.KNeighborsRegressor(5, weights='distance')
Example #8
    def __init__(self, env_in):
        EnvLearner.__init__(self, env_in)
        # from baselines.ddpg.models import Actor, Critic
        # Parse noise_type
        action_noise = None
        param_noise = None
        noise_type = 'adaptive-param_0.2'
        layer_norm = True
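        # Note: the "actions" of this DDPG agent are state-sized (nb_actions is
        # self.state_dim and the replay Memory's action_shape is the environment's
        # observation shape), consistent with the learner predicting next observations
        # rather than emitting environment actions.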
        nb_actions = self.state_dim
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.

        self.buff_len = 10
        self.buffer = deque(self.buff_init * self.buff_len,
                            maxlen=self.buff_len)
        obs_space = (self.buff_init[0].size * self.buff_len, )
        self.memory = Memory(limit=int(1e6),
                             action_shape=env_in.observation_space.shape,
                             observation_shape=obs_space)
        self.critic = models.Critic(layer_norm=layer_norm)
        self.actor = models.Actor(nb_actions, layer_norm=layer_norm)

        self.agent = DDPG(self.actor,
                          self.critic,
                          self.memory,
                          obs_space,
                          env_in.observation_space.shape,
                          gamma=0.99,
                          tau=0.01,
                          normalize_returns=False,
                          normalize_observations=True,
                          batch_size=64,
                          action_noise=action_noise,
                          param_noise=param_noise,
                          critic_l2_reg=1e-2,
                          actor_lr=1e-5,
                          critic_lr=1e-5,
                          enable_popart=False,
                          clip_norm=None,
                          reward_scale=1.)
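A small standalone illustration of my own for the noise_type format parsed above: a comma-separated list of '<kind>_<stddev>' entries ('none' disables noise). The string below is an example value, not taken from the source:

noise_type = 'adaptive-param_0.2,normal_0.1'
for entry in noise_type.split(','):
    kind, _, stddev = entry.strip().partition('_')
    print(kind, float(stddev))
# adaptive-param 0.2
# normal 0.1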