Example #1
    def run_iters(self):
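        """DAgger-style loop (as inferred from the calls below): iteration 0
        rolls out the supervisor alone; later iterations roll out a
        supervisor/learner mixture via collect_traj_beta, scale beta by
        params['beta'] each iteration, and relabel the visited states with the
        supervisor's intended actions. The learner is retrained at the
        iterations listed in params['update'], and dataset snapshots taken at
        params['iters'] are retrained and evaluated after the loop."""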
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []

        beta = self.params['beta']

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            if i in self.params['update']:
                self.lnr.train(verbose=True)

            if i == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()

            else:
                states, _, _, _ = statistics.collect_traj_beta(
                    self.env, self.sup, self.lnr, T, beta, False)
                i_actions = [self.sup.intended_action(s) for s in states]
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                beta = beta * self.params['beta']

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #2
    def run_iters(self):
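        """Behavior-cloning baseline (as inferred): every iteration rolls out
        the supervisor alone and aggregates the filtered data. Dataset
        snapshots taken at params['iters'] are retrained and evaluated,
        including bias/variance and covariate-shift statistics."""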
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
            'biases': [],
            'variances': [],
            'biases_learner': [],
            'variances_learner': [],
            'covariate_shifts': []
        }

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)
            self.lnr.add_data(states, i_actions)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['biases'].append(it_results['biases_mean'])
            results['variances'].append(it_results['variances_mean'])
            results['biases_learner'].append(it_results['biases_learner_mean'])
            results['variances_learner'].append(
                it_results['variances_learner_mean'])
            results['covariate_shifts'].append(
                it_results['covariate_shifts_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #3
    def run_iters(self):
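        """Noise-injected supervisor rollouts (DART-style, as inferred): the
        base supervisor self.net_sup is wrapped in a GaussianSupervisor whose
        covariance is a random PSD matrix rescaled so its trace equals
        params['trace']. All rollouts use this noisy supervisor, and dataset
        snapshots taken at params['iters'] are retrained and evaluated after
        the loop."""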
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []

        d = self.params['d']
        new_cov = np.random.normal(0, 1, (d, d))
        new_cov = new_cov.T.dot(new_cov)
        new_cov = new_cov / np.trace(new_cov) * self.params['trace']
        self.sup = GaussianSupervisor(self.net_sup, new_cov)

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)


            states, i_actions, _, _ = statistics.collect_traj(self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
            
            self.lnr.add_data(states, i_actions)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()
            
            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y))


        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #4
    def run_iters(self):
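        """Noise-injected supervisor with periodic noise re-optimization (as
        inferred): rollouts continue until params['max_data'] states have been
        collected. Each rollout is split by the params['partition'] fraction
        into a noise-optimization set (kept in trajs) and a training set, and
        the supervisor is refit via update_noise roughly every
        params['update_period'] collected states. Prefixes of the aggregated
        data listed in self.snapshot_ranges are retrained and evaluated, and
        total collection time is recorded."""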
        T = self.params['t']
        partition = self.params['partition']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        start_time = timer.time()

        trajs = []
        traj_snapshots = []
        self.optimized_data = 0

        data_states = []
        data_actions = []

        train_states = []
        train_i_actions = []

        supervisors = []

        iteration = 0
        last_data_update = 0

        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)

            data_states += states
            data_actions += i_actions
            supervisors += [self.sup] * len(states)

            rang = np.arange(0, len(states))
            np.random.shuffle(rang)

            partition_cutoff = int(partition * len(states))
            noise_states, noise_actions = [
                states[k] for k in rang[:partition_cutoff]
            ], [i_actions[k] for k in rang[:partition_cutoff]]
            states, i_actions = [states[k] for k in rang[partition_cutoff:]], [
                i_actions[k] for k in rang[partition_cutoff:]
            ]

            train_states += states
            train_i_actions += i_actions

            self.lnr.set_data(train_states, train_i_actions)
            trajs.append((noise_states, noise_actions))

            if iteration == 0 or len(data_states) >= (
                    last_data_update + self.params['update_period']):
                self.sup = self.update_noise(iteration, trajs)

                difference = (len(data_states) -
                              last_data_update) / self.params['update_period']
                last_data_update += difference * self.params['update_period']

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:
            # Actual evaluations; comment these out and use the time-trial block below for timing-only runs
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            self.sup = supervisors[sr - 1]
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        log("\tTrain data: " + str(len(train_i_actions)))
        log("\tNoise opt data: " + str(self.count_states(trajs)))

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time
        return results
Example #5
    def run_iters(self):
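        """Mixed-rollout variant (as inferred): iteration 0 uses the
        supervisor alone; later iterations use collect_traj_mixed, which
        switches within a trajectory at switch_idx and returns supervisor
        actions for the post-switch segment. With params['dagger_mixed'] the
        pre-switch states are relabeled with the supervisor's intended actions
        and kept as well; otherwise only the post-switch data is used.
        Snapshots at params['iters'] record the dataset and the switch index
        (note: switch_idx is only set for i > 0, so this assumes 1 is not in
        params['iters'])."""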
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
            'biases': [],
            'variances': [],
            'biases_learner': [],
            'variances_learner': [],
            'covariate_shifts': []
        }

        trajs = []

        snapshots = []
        switch_idxs = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            if i == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()

            else:
                post_switch_states, post_switch_sup_actions, pre_switch_states, switch_idx, _ = statistics.collect_traj_mixed(
                    self.env, self.sup, self.lnr, T, i,
                    self.params['iters'][-1], False)

                if self.params['dagger_mixed']:
                    i_actions_dagger = [
                        self.sup.intended_action(s) for s in pre_switch_states
                    ]
                    states = pre_switch_states + post_switch_states
                    i_actions = i_actions_dagger + post_switch_sup_actions
                else:
                    states = post_switch_states
                    i_actions = post_switch_sup_actions

                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train(verbose=True)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))
                switch_idxs.append(switch_idx)

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation(
                mixed_switch_idx=switch_idxs[j])

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['biases'].append(it_results['biases_mean'])
            results['variances'].append(it_results['variances_mean'])
            results['biases_learner'].append(it_results['biases_learner_mean'])
            results['variances_learner'].append(
                it_results['variances_learner_mean'])
            results['covariate_shifts'].append(
                it_results['covariate_shifts_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #6
    def run_iters(self):
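        """Per-iteration noise re-optimization (as inferred): before each
        rollout the supervisor is refit via update_noise on the accumulated
        noise-optimization trajectories (trajs). Up to params['partition']
        shuffled samples from each rollout's held-out split feed that set;
        the filtered remainder trains the learner. Snapshots at
        params['iters'] also record self.optimized_data, so the reported
        data_used counts the noise-optimization samples as well."""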
        T = self.params['t']
        partition = self.params['partition']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []
        snapshots = []
        traj_snapshots = []
        self.optimized_data = 0

        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            self.sup = self.update_noise(i, trajs)

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, (held_out_states,
                                held_out_actions) = utils.filter_data(
                                    self.params, states, i_actions)

            rang = np.arange(0, len(held_out_states))
            np.random.shuffle(rang)
            noise_states, noise_actions = [
                held_out_states[k] for k in rang[:partition]
            ], [held_out_actions[k] for k in rang[:partition]]

            trajs.append((noise_states, noise_actions))
            self.lnr.add_data(states, i_actions)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))
                traj_snapshots.append(self.optimized_data)

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            optimized_data = traj_snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y) + optimized_data)
            print "\nTrain data: " + str(len(y))
            print "\n Optimize data: " + str(optimized_data)

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #7
    def run_iters(self):
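        """DAgger-style loop by data count (as inferred): iteration 0 rolls
        out the supervisor; later iterations roll out the supervisor/learner
        mixture via collect_traj_beta, relabel the filtered states with the
        supervisor's intended actions, and scale beta by params['beta'].
        Collection runs until params['max_data'] states are gathered, with the
        learner retrained roughly every params['update_period'] states.
        Prefixes of the aggregated data in self.snapshot_ranges are retrained,
        evaluated, and timed."""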
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        start_time = timer.time()
        trajs = []

        beta = self.params['beta']

        data_states = []
        data_actions = []

        iteration = 0
        last_data_update = 0

        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            if iteration == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
            else:
                states, tmp_actions, _, _ = statistics.collect_traj_beta(
                    self.env, self.sup, self.lnr, T, beta, False)
                states, _, _ = utils.filter_data(self.params, states,
                                                 tmp_actions)
                i_actions = [self.sup.intended_action(s) for s in states]
                beta = beta * self.params['beta']

            data_states += states
            data_actions += i_actions

            self.lnr.set_data(data_states, data_actions)

            if iteration == 0 or len(data_states) >= (
                    last_data_update + self.params['update_period']):
                self.lnr.train(verbose=True)

                difference = (len(data_states) -
                              last_data_update) / self.params['update_period']
                last_data_update += difference * self.params['update_period']

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:
            # Actual evaluations; comment these out and use the time-trial block below for timing-only runs
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time
        return results
Example #8
    def run_iters(self):
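        """Behavior-cloning baseline by data count (as inferred): supervisor
        rollouts are aggregated until params['max_data'] states have been
        collected; prefixes of the aggregated data in self.snapshot_ranges are
        then retrained and evaluated, with total collection time recorded."""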
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }

        start_time = timer.time()
        data_states = []
        data_actions = []

        iteration = 0
        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)

            data_states += states
            data_actions += i_actions

            self.lnr.set_data(data_states, data_actions)

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:

            # Actual evaluations; comment these out and use the time-trial block below for timing-only runs
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time

        return results
    def run_iters(self):
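        """Adaptive supervisor/learner switching (as inferred): iteration 0
        rolls out the supervisor. Afterwards, if the previous rollout used the
        learner and its estimated variance fraction
        variance / (bias + variance) exceeds 0.5, control falls back to the
        supervisor; otherwise the learner rolls out and its visited states are
        relabeled with the supervisor's intended actions. Each snapshot also
        records which agent generated the state distribution for use in the
        evaluation."""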
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
            'biases': [],
            'variances': [],
            'biases_learner': [],
            'variances_learner': [],
            'covariate_shifts': []
        }

        trajs = []
        snapshots = []
        dist_gen_agents = []
        learner_bias, learner_variance = None, None

        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            if i == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()
                learner_last = False
                dist_gen_agent = self.sup
            else:
                # if was learner last time and variance > some quantity switch to supervisor
                if learner_last and float(learner_variance) / (
                        float(learner_bias) + float(learner_variance)
                ) > 0.5:  # TODO: can modify this threshold in various ways as see fit...
                    states, i_actions, _, _ = statistics.collect_traj(
                        self.env, self.sup, T, False)
                    trajs.append((states, i_actions))
                    states, i_actions, _ = utils.filter_data(
                        self.params, states, i_actions)
                    self.lnr.add_data(states, i_actions)
                    self.lnr.train()
                    learner_last = False
                    dist_gen_agent = self.sup
                else:
                    states, _, _, _ = statistics.collect_traj(
                        self.env, self.lnr, T, False)
                    i_actions = [self.sup.intended_action(s) for s in states]
                    states, i_actions, _ = utils.filter_data(
                        self.params, states, i_actions)
                    self.lnr.add_data(states, i_actions)
                    self.lnr.train(verbose=True)
                    learner_last = True
                    learner_bias, learner_variance = statistics.evaluate_bias_variance_learner_cont(
                        self.env, self.lnr, self.sup, T, num_samples=20)
                    dist_gen_agent = self.lnr

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))
                dist_gen_agents.append(dist_gen_agent)

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation(
                dist_gen_agent=dist_gen_agents[j])

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['biases'].append(it_results['biases_mean'])
            results['variances'].append(it_results['variances_mean'])
            results['biases_learner'].append(it_results['biases_learner_mean'])
            results['variances_learner'].append(
                it_results['variances_learner_mean'])
            results['covariate_shifts'].append(
                it_results['covariate_shifts_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results