Example #1
    def augment_dataset(self, traj_list, states, actions, rewards, sprimes):
        new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(traj_list)
        new_data_obtained = len(new_s) > 0

        if new_data_obtained:
            if states is not None:
                n_new = len(new_s)
                n_dim_state = states.shape[1]
                states = np.r_[states, new_s.reshape((n_new, n_dim_state))]
                actions = np.r_[actions, new_a]
                rewards = np.r_[rewards, new_r]
                sprimes = np.r_[sprimes, new_sprime.reshape((n_new, n_dim_state))]
            else:
                states = new_s
                actions = new_a
                rewards = new_r
                sprimes = new_sprime

        if states is not None:
            terminal_state_idxs = np.where(np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
            nonterminal_mask = np.ones((sprimes.shape[0], 1))
            nonterminal_mask[terminal_state_idxs, :] = 0
        else:
            nonterminal_mask = None

        return states, actions, rewards, sprimes, nonterminal_mask, new_data_obtained
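
A note on the masking step in augment_dataset: terminal transitions are assumed to be stored with an all-zero next-state, so summing sprimes over its trailing axes and testing for zero recovers the terminal indices. Below is a minimal, self-contained sketch of that idea (the helper name nonterminal_mask and the toy shapes are illustrative, and an absolute sum is used here for robustness, unlike the plain sum above):

    import numpy as np

    def nonterminal_mask(sprimes):
        # sprimes: (n_transitions, ...) next-states; all-zero entries mark terminals
        flat = sprimes.reshape((sprimes.shape[0], -1))
        terminal_idxs = np.where(np.abs(flat).sum(axis=1) == 0)[0]
        mask = np.ones((sprimes.shape[0], 1))
        mask[terminal_idxs, :] = 0  # 0 if terminal state, 1 otherwise
        return mask

    # toy usage: the middle transition is terminal, so its mask entry is 0
    sprimes = np.array([[0.3, 0.1], [0.0, 0.0], [1.2, -0.4]])
    print(nonterminal_mask(sprimes).ravel())  # -> [1. 0. 1.]
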
Example #2
    def train(self, problem, seed, epochs=500, d_lr=1e-3, g_lr=1e-4):
        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)

        print self.opt_G.get_config()

        self.n_feasible_trajs = 0
        traj_list = []
        self.pfilename = self.save_folder + '/' + str(
            seed) + '_performance.txt'
        pfile = open(self.pfilename, 'wb')
        n_data = 0
        n_remains = []
        for i in range(1, epochs):
            self.epoch = i
            print "N simulations %d/%d" % (i, epochs)
            if 'convbelt' in problem.name:
                length_of_rollout = 20
            else:
                length_of_rollout = 10

            for n_iter in range(1):  # one rollout per simulation (cf. N = 5, T = 20 in the PPO paper's notation)
                problem.init_saver.Restore()
                problem.objects_currently_not_in_goal = problem.objects
                traj, n_remain = problem.rollout_the_policy(
                    self, length_of_rollout)
                if len(traj['a']) > 0:
                    traj_list.append(traj)
                    n_remains.append(n_remain)

            if len(traj['a']) > 0:
                avg_J = self.log_traj_performance([traj_list[-1]],
                                                  n_remains[-1], i, n_data)
                lowest_possible_reward = -2
                if avg_J > lowest_possible_reward:
                    self.n_feasible_trajs += 1
            else:
                avg_J = self.log_traj_performance(-2.0, 7, i, n_data)

            is_time_to_train = i % 10 == 0
            if is_time_to_train and len(traj_list) > 0:
                new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                    traj_list)
                n_data += len(new_s)
                self.update_V(new_s, new_sumR)
                new_sumA = self.compute_advantage_values(
                    new_s, new_a, new_sprime, new_r, new_traj_lengths)
                self.update_policy(new_s, new_a, new_sumA)
                traj_list = []
                n_remains = []
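
Example #2 trains on-policy in waves: rollouts are accumulated into traj_list and, every 10 simulations, the value function is fit to the observed returns, advantages are computed, the policy is updated, and the buffer is cleared. The per-step return that format_RL_data appears to produce as new_sumR can be sketched as a reward-to-go computation; the discounting and the function name below are assumptions, not the repository's actual helper:

    import numpy as np

    def rewards_to_go(rewards, gamma=1.0):
        # rewards: per-step rewards of a single trajectory
        # returns G_t = sum_{k >= t} gamma^(k-t) * r_k for every timestep t
        G = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            G[t] = running
        return G

    print(rewards_to_go([1.0, 0.0, 2.0], gamma=0.9))  # -> [2.62, 1.8, 2.0]
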
Example #3
    def train(self,states,actions,rewards,sprimes,sumR,traj_lengths,\
              epochs=500,d_lr=1e-3,g_lr=1e-4):
        states = states.squeeze()
        sprimes = sprimes.squeeze()
        true_performance_list = []
        G_performance_list = []
        mse_list = []

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)

        print self.opt_G.get_config()
        print "Fitting V..."
        current_best_J = -np.inf

        stime = time.time()

        self.update_V(states, sumR)
        adv = self.compute_A(states, actions, sprimes, rewards, traj_lengths)
        self.update_pi(states, actions, adv)

        self.saveWeights(additional_name='epoch_' + str(0))
        print time.time() - stime

        # train pi
        for i in range(1, epochs):
            stime = time.time()
            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            # Try policy - 5 trajectories, each 20 long
            traj_list = []
            for n_iter in range(
                    5):  # N = 5, T = 20, using the notation from PPO paper
                problem = ConveyorBelt()  # different "initial" state
                traj = problem.execute_policy(self,
                                              20,
                                              visualize=self.visualize)
                traj_list.append(traj)
                problem.env.Destroy()
                RaveDestroy()
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J
            print time.time() - stime

            # Add new data to the buffer
            new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                traj_list)
            new_a = self.a_scaler.transform(new_a)

            self.update_V(new_s, new_sumR)
            new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r,
                                      new_traj_lengths)
            self.update_pi(new_s, new_a, new_sumA)

            if avg_J > current_best_J:
                current_best_J = avg_J
                self.saveWeights(additional_name='epoch_' + str(i) + '_' + str(avg_J))
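
The last block of Example #3 checkpoints the policy only when its average return beats the best seen so far. A compact sketch of that bookkeeping pattern, with a stand-in saver callback in place of the class's saveWeights:

    import numpy as np

    class BestCheckpointer(object):
        def __init__(self, saver):
            self.best_J = -np.inf
            self.saver = saver  # e.g. lambda tag: model.save_weights(tag + '.h5')

        def update(self, epoch, avg_J):
            # save only when the average return improves on the best so far
            if avg_J > self.best_J:
                self.best_J = avg_J
                self.saver('epoch_%d_%.3f' % (epoch, avg_J))
                return True
            return False

    ckpt = BestCheckpointer(saver=lambda tag: None)
    print([ckpt.update(1, 0.5), ckpt.update(2, 0.3), ckpt.update(3, 0.9)])  # [True, False, True]
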
Example #4
    def train(self,states,actions,rewards,sprimes,\
              epochs=500,d_lr=1e-3,g_lr=1e-4):
        states = states.squeeze()
        sprimes = sprimes.squeeze()
        true_performance_list = []
        G_performance_list = []
        mse_list = []

        n_data = states.shape[0]

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)

        print self.opt_G.get_config()
        pfile = open(self.save_folder + '/performance.txt', 'w')
        pfile.close()

        current_best_J = -np.inf
        n_score_train = 1
        for i in range(1, epochs):
            BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
            if BATCH_SIZE == 0:
                BATCH_SIZE = 1

            terminal_state_idxs = np.where(
                np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
            nonterminal_mask = np.ones((sprimes.shape[0], 1))
            nonterminal_mask[terminal_state_idxs, :] = 0

            stime = time.time()
            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            n_iter = len(range(0, max(actions.shape[0], n_data), BATCH_SIZE))
            n_iter = min(100, n_iter)
            print "n_iter", n_iter
            #for idx in range(0,max(actions.shape[0],n_data),BATCH_SIZE):
            for _ in range(n_iter):
                for score_train_idx in range(n_score_train):
                    # choose a batch of data - experience replay
                    indices = np.random.randint(0,
                                                actions.shape[0],
                                                size=BATCH_SIZE)
                    s_batch = np.array(states[indices, :])  # collision vector
                    a_batch = np.array(actions[indices, :])
                    r_batch = np.array(rewards[indices, :])
                    sprime_batch = np.array(sprimes[indices, :])
                    mask_batch = np.array(nonterminal_mask[
                        indices, :])  # 0 if terminal state, 1 ow

                    fake = self.a_gen.predict([sprime_batch])
                    real = a_batch

                    # make their scores
                    fake_targets = np.ones(
                        (BATCH_SIZE, 1)) * INFEASIBLE_SCORE  # marks fake data
                    real_targets = r_batch + np.multiply(
                        self.disc.predict([fake, sprime_batch]), mask_batch)
                    # Q = r(s,a) when s' is terminal (mask = 0); otherwise bootstrapped with D

                    batch_x = np.vstack([fake, real])
                    batch_w = np.vstack([s_batch, s_batch])
                    batch_targets = np.vstack([fake_targets, real_targets])
                    self.disc.fit({
                        'x': batch_x,
                        'w': batch_w
                    },
                                  batch_targets,
                                  epochs=1,
                                  verbose=False)

                # train G
                y_labels = np.ones((BATCH_SIZE, ))  #dummy variable
                self.DG.fit({'w': s_batch}, {
                    'disc_output': y_labels,
                    'a_gen_output': y_labels
                },
                            epochs=1,
                            verbose=0)
            print "Training took: %.2fs" % (time.time() - stime)
            # Try policy - 5 trajectories, each 20 long
            stime = time.time()
            traj_list = self.parallel_rollout()
            print "Rollout took: %.2fs" % (time.time() - stime)
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J

            # Add new data to the buffer
            new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                traj_list)
            new_a = self.a_scaler.transform(new_a)
            states = np.r_[states, new_s.squeeze()]
            actions = np.r_[actions, new_a]
            rewards = np.r_[rewards, new_r]
            sprimes = np.r_[sprimes, new_sprime.squeeze()]

            if avg_J > current_best_J:
                current_best_J = avg_J
                self.saveWeights(additional_name='lambda_' + str(LAMBDA) + 'epoch_' + str(i) + '_' + str(avg_J))

            print "Epoch took: %.2fs" % (time.time() - stime)
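
The discriminator in Example #4 is fit like a Q-function: generated ("fake") actions receive a large negative constant (INFEASIBLE_SCORE), while demonstrated ("real") actions receive the bootstrapped target r + D(pi(s'), s'), zeroed at terminal states by the nonterminal mask. A minimal numpy sketch of that target construction, where q_fn and policy_fn are hypothetical stand-ins for self.disc.predict and self.a_gen.predict:

    import numpy as np

    INFEASIBLE_SCORE = -9999.0  # assumed constant marking generated ("fake") actions

    def build_disc_targets(r_batch, sprime_batch, mask_batch, q_fn, policy_fn):
        # q_fn(a, s) -> (B, 1) scores; policy_fn(s) -> (B, a_dim) actions; all numpy arrays
        a_next = policy_fn(sprime_batch)  # actions the current policy proposes at s'
        fake_targets = np.full((r_batch.shape[0], 1), INFEASIBLE_SCORE)
        real_targets = r_batch + q_fn(a_next, sprime_batch) * mask_batch  # Q = r at terminals
        return fake_targets, real_targets
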
Example #5
    def train(self,states,actions,\
              epochs=500,d_lr=1e-3,g_lr=1e-4):
        states = states.squeeze()

        true_performance_list = []
        G_performance_list = []
        mse_list = []

        n_data = states.shape[0]
        BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
        if BATCH_SIZE == 0:
            BATCH_SIZE = 1
        print BATCH_SIZE

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)
        print self.opt_G.get_config()

        current_best_J = -np.inf
        n_score_train = 1
        performance_list = []
        pfile = open(self.save_folder + '/performance.txt', 'w')
        pfile.close()
        for i in range(1, epochs):
            stime = time.time()

            # Rollouts: 5 trajectories, each 20 steps long
            traj_list = []
            for n_iter in range(5):
                problem = ConveyorBelt()  # different "initial" state
                traj = problem.execute_policy(self, 20, self.v)
                traj_list.append(traj)
                problem.env.Destroy()
                RaveDestroy()
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J

            # new rollout dataset
            new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                traj_list)
            new_a = self.a_scaler.transform(new_a)

            # choose a batch of data
            indices = np.random.randint(0, actions.shape[0], size=BATCH_SIZE)
            s_batch = np.array(states[indices, :])  # collision vector
            a_batch = np.array(actions[indices, :])

            pi_indices = np.random.randint(0, new_a.shape[0], size=BATCH_SIZE)
            pi_s_batch = np.array(new_s[pi_indices, :])  # collision vector
            pi_a_batch = np.array(new_a[pi_indices, :])

            # make their scores
            fake_scores = np.zeros((BATCH_SIZE, 1))
            real_scores = np.ones((BATCH_SIZE, 1))
            batch_x = np.vstack([pi_a_batch, a_batch])
            batch_w = np.vstack([pi_s_batch, s_batch])
            batch_scores = np.vstack([fake_scores, real_scores])

            # Update  D
            self.disc.fit({
                'x': batch_x,
                'w': batch_w
            },
                          batch_scores,
                          epochs=1,
                          verbose=False)
            new_r, new_sumR = self.compute_r_using_D(traj_list)

            # update value function
            self.update_V(new_s, new_sumR)

            # update policy
            new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r,
                                      new_traj_lengths)
            self.update_pi(new_s, new_a, new_sumA)

            self.saveWeights(additional_name='epoch_'+\
                            str(i)+'_'+str(avg_J))

            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            print "Epoch took: %.2fs" % (time.time() - stime)
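
Example #5 follows a GAIL-style recipe: the discriminator sees demonstration (state, action) pairs labeled 1 and freshly rolled-out policy pairs labeled 0, and the policy's reward is then read off the trained discriminator via compute_r_using_D. A sketch of assembling one such labeled batch (the function and argument names are illustrative):

    import numpy as np

    def make_disc_batch(demo_s, demo_a, pi_s, pi_a, batch_size, rng=np.random):
        demo_idx = rng.randint(0, demo_a.shape[0], size=batch_size)
        pi_idx = rng.randint(0, pi_a.shape[0], size=batch_size)
        x = np.vstack([pi_a[pi_idx], demo_a[demo_idx]])  # actions fed to the discriminator
        w = np.vstack([pi_s[pi_idx], demo_s[demo_idx]])  # conditioning states
        y = np.vstack([np.zeros((batch_size, 1)),        # 0 = policy ("fake")
                       np.ones((batch_size, 1))])        # 1 = demonstration ("real")
        return x, w, y
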
Example #6
    def train(self,states,actions,rewards,sprimes,\
              epochs=500,d_lr=1e-3,g_lr=1e-4):
        states = states.squeeze()
        sprimes = sprimes.squeeze()
        true_performance_list = []
        G_performance_list = []
        mse_list = []

        n_data = states.shape[0]
        BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
        if BATCH_SIZE == 0:
            BATCH_SIZE = 1
        print BATCH_SIZE

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)
        print self.opt_G.get_config()

        current_best_J = -np.inf
        pfile = open(self.save_folder + '/performance.txt', 'w')
        pfile.close()

        # n_episodes = epochs * 5, T = 20, but the policy is updated only after all T steps
        # of an episode are executed: in this episodic task, meaningful moves can only be
        # learned by going deep into the trajectory. This yields roughly 300*5*20 RL transitions.
        for i in range(1, epochs):
            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            stime = time.time()

            terminal_state_idxs = np.where(
                np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
            nonterminal_mask = np.ones((sprimes.shape[0], 1))
            nonterminal_mask[terminal_state_idxs, :] = 0

            # make the targets
            fake = self.a_gen.predict([sprimes])  # predicted by pi
            real = actions

            real_targets = rewards + np.multiply(
                self.disc.predict([fake, sprimes]), nonterminal_mask)
            stime = time.time()
            self.update_disc(real, states, real_targets, BATCH_SIZE)
            self.update_pi(states, BATCH_SIZE)
            print 'Fitting time', time.time() - stime

            # Technically speaking, we should update the policy every timestep.
            # What if we update it 100 times after we executed 5 episodes, each with 20 timesteps??
            stime = time.time()
            traj_list = []
            for n_iter in range(5):
                problem = ConveyorBelt()  # different "initial" state
                traj = problem.execute_policy(self, 20, self.v)
                traj_list.append(traj)
                problem.env.Destroy()
                RaveDestroy()
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J

            # Add new data to the buffer - only if this was a non-zero trajectory
            if avg_J > 1.0:
                new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                    traj_list)
                new_a = self.a_scaler.transform(new_a)
                states = np.r_[states, new_s.squeeze()]
                actions = np.r_[actions, new_a]
                rewards = np.r_[rewards, new_r]
                sprimes = np.r_[sprimes, new_sprime.squeeze()]
                print "Rollout time", time.time() - stime

            if avg_J > current_best_J:
                current_best_J = avg_J
                self.saveWeights(additional_name='tau_' + str(self.tau) + 'epoch_' + str(i) + '_' + str(avg_J))
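
Example #6 grows its replay buffer with np.r_, but only keeps a batch of rollouts when the average return clears a threshold (avg_J > 1.0 above). A small sketch of that filtered append, using hypothetical array names and a made-up threshold argument:

    import numpy as np

    def append_if_good(buffer_arrays, new_arrays, avg_J, threshold=1.0):
        # buffer_arrays / new_arrays: tuples of (states, actions, rewards, sprimes)
        if avg_J <= threshold:
            return buffer_arrays  # discard low-return rollouts
        return tuple(np.r_[old, new] for old, new in zip(buffer_arrays, new_arrays))

    buf = (np.zeros((2, 3)), np.zeros((2, 1)), np.zeros((2, 1)), np.zeros((2, 3)))
    new = (np.ones((5, 3)), np.ones((5, 1)), np.ones((5, 1)), np.ones((5, 3)))
    print([a.shape for a in append_if_good(buf, new, avg_J=2.0)])  # [(7, 3), (7, 1), ...]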