Example #1
    def __init__(self, cfg):
        self.cfg = cfg

        if self.cfg.seed is not None:
            log.info('Setting fixed seed %d', self.cfg.seed)
            torch.manual_seed(self.cfg.seed)
            np.random.seed(self.cfg.seed)

        self.device = torch.device('cuda')

        self.train_step = self.env_steps = 0

        self.total_train_seconds = 0
        self.last_training_step = time.time()

        self.best_avg_reward = math.nan

        self.summary_rate_decay = LinearDecay([(0, 100), (1000000, 2000),
                                               (10000000, 10000)])
        self.last_summary_written = -1e9
        self.save_rate_decay = LinearDecay([(0, self.cfg.initial_save_rate),
                                            (1000000, 5000)],
                                           staircase=100)

        summary_dir = summaries_dir(experiment_dir(cfg=self.cfg))
        self.writer = SummaryWriter(summary_dir, flush_secs=10)
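The LinearDecay schedule used above is a project helper. As a rough illustration (not the project's implementation), a piecewise-linear schedule over (step, value) breakpoints with an optional staircase rounding could look like the sketch below; the class and method names are hypothetical.

# Hypothetical sketch of a piecewise-linear schedule similar in spirit to LinearDecay.
# Assumes breakpoints are (step, value) pairs sorted by step; 'staircase' (if set)
# rounds the interpolated value down to a multiple of that number.
class PiecewiseLinearSchedule:
    def __init__(self, breakpoints, staircase=None):
        self.breakpoints = sorted(breakpoints)
        self.staircase = staircase

    def at(self, step):
        s0, v0 = self.breakpoints[0]
        value = self.breakpoints[-1][1]  # value past the last breakpoint
        if step <= s0:
            value = v0
        else:
            for s1, v1 in self.breakpoints[1:]:
                if step <= s1:
                    # linear interpolation between the surrounding breakpoints
                    value = v0 + (step - s0) / (s1 - s0) * (v1 - v0)
                    break
                s0, v0 = s1, v1
        if self.staircase:
            value = (value // self.staircase) * self.staircase
        return value

# e.g. PiecewiseLinearSchedule([(0, 100), (1000000, 2000)]).at(500000) -> 1050.0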
Example #2
    def __init__(self, params):
        super(AgentLearner, self).__init__(params)
        self.session = None  # actually created in "initialize" method
        self.saver = None

        tf.reset_default_graph()

        if self.params.seed >= 0:
            tf.random.set_random_seed(self.params.seed)

        self.summary_rate_decay = LinearDecay([(0, 100), (1000000, 2000),
                                               (10000000, 10000)],
                                              staircase=100)
        self.save_rate_decay = LinearDecay([(0, self.params.initial_save_rate),
                                            (1000000, 5000)],
                                           staircase=100)

        self.initial_best_avg_reward = tf.constant(-1e3)
        self.best_avg_reward = tf.Variable(self.initial_best_avg_reward)
        self.total_env_steps = tf.Variable(0, dtype=tf.int64)

        def update_best_value(best_value, new_value):
            return tf.assign(best_value, tf.maximum(new_value, best_value))

        self.avg_reward_placeholder = tf.placeholder(tf.float32, [],
                                                     'new_avg_reward')
        self.update_best_reward = update_best_value(
            self.best_avg_reward, self.avg_reward_placeholder)
        self.total_env_steps_placeholder = tf.placeholder(
            tf.int64, [], 'new_env_steps')
        self.update_env_steps = tf.assign(self.total_env_steps,
                                          self.total_env_steps_placeholder)

        summary_dir = summaries_dir(self.params.experiment_dir())
        self.summary_writer = tf.summary.FileWriter(summary_dir)

        self.position_histograms = deque(
            [], maxlen=self.params.num_position_histograms)

        self._last_trajectory_summary = 0  # timestamp of the latest trajectory summary written
        self._last_coverage_summary = 0  # timestamp of the latest coverage summary written

        self.map_img = self.coord_limits = None
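Example #2 only builds the graph; the placeholders and assign ops are fed later through the session created in the "initialize" method. A hedged usage sketch (the method name is hypothetical, and self.session is assumed to exist by then):

    # Hypothetical usage of the update ops built above, once self.session exists.
    def _update_best_reward(self, avg_reward):
        # runs tf.assign(best, max(best, new)) and returns the updated best value
        return self.session.run(
            self.update_best_reward,
            feed_dict={self.avg_reward_placeholder: avg_reward},
        )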
Example #3
    def __init__(self, make_env_func, params):
        """Initialize PPO computation graph and some auxiliary tensors."""
        super(AgentRandom, self).__init__(params)

        self.make_env_func = make_env_func
        env = make_env_func()
        self.action_space = env.action_space
        env.close()

        self.objectives = None

        self.last_action = None
        self.best_reward = None

        summary_dir = summaries_dir(self.params.experiment_dir())
        self.summary_writer = tf.summary.FileWriter(summary_dir)

        if self.params.use_env_map:
            self.map_img, self.coord_limits = generate_env_map(make_env_func)
Example #4
    def __init__(self, make_env_func, params):
        """Initialize PPO computation graph and some auxiliary tensors."""
        super(AgentPPO, self).__init__(params)

        self.actor_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='actor_step')
        self.critic_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='critic_step')

        self.make_env_func = make_env_func
        env = make_env_func()  # we need the env to query observation shape, number of actions, etc.

        self.obs_shape = [-1] + list(main_observation_space(env).shape)
        self.ph_observations = placeholder_from_space(main_observation_space(env))
        self.ph_actions = placeholder_from_space(env.action_space)  # actions sampled from the policy
        self.ph_advantages, self.ph_returns, self.ph_old_action_probs = placeholders(None, None, None)

        self.actor_critic = ActorCritic(env, self.ph_observations, self.params)

        env.close()

        self.objectives = self.add_ppo_objectives(
            self.actor_critic,
            self.ph_actions, self.ph_old_action_probs, self.ph_advantages, self.ph_returns,
            self.params,
            self.actor_step,
        )

        # optimizers
        actor_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='actor_opt')
        self.train_actor = actor_opt.minimize(self.objectives.actor_loss, global_step=self.actor_step)

        critic_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='critic_opt')
        self.train_critic = critic_opt.minimize(self.objectives.critic_loss, global_step=self.critic_step)

        self.add_ppo_summaries()

        summary_dir = summaries_dir(self.params.experiment_dir())
        self.summary_writer = tf.summary.FileWriter(summary_dir)
        self.actor_summaries = merge_summaries(collections=['actor'])
        self.critic_summaries = merge_summaries(collections=['critic'])

        if self.params.use_env_map:
            self.map_img, self.coord_limits = generate_env_map(make_env_func)
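merge_summaries above is a project helper. Assuming it simply merges all summary ops registered under the given graph collections, a minimal TF1-style sketch could be:

# Hypothetical sketch of merge_summaries, assuming it wraps tf.summary.merge
# over summary ops registered in named graph collections.
def merge_summaries(collections=None):
    summaries = []
    for collection in (collections or []):
        summaries.extend(tf.get_collection(collection))
    return tf.summary.merge(summaries)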
Example #5
    def test_graph_tensorboard(self):
        graph = nx.DiGraph()
        for i in range(100):
            graph.add_node(i)

        for i in range(100):
            nonedges = list(nx.non_edges(graph))
            chosen_nonedge = random.choice(nonedges)
            graph.add_edge(chosen_nonedge[0], chosen_nonedge[1])

        params = AgentLearner.AgentParams('__test_graph__')
        summary_dir = summaries_dir(params.experiment_dir())

        summary_writer = tf.summary.FileWriter(summary_dir)

        start_summary = time.time()
        summary = visualize_graph_tensorboard(graph, tag='test/graph')
        log.debug('Took %.3f seconds to write graph summary',
                  time.time() - start_summary)
        summary_writer.add_summary(summary, global_step=1)

        summary_writer.close()  # flush and release the event file before deleting the directory
        shutil.rmtree(params.experiment_dir())
Example #6
    def __init__(self, cfg):
        super().__init__(cfg)

        # we should not use CUDA in the main thread, only on the workers
        set_global_cuda_envvars(cfg)

        tmp_env = make_env_func(self.cfg, env_config=None)
        self.obs_space = tmp_env.observation_space
        self.action_space = tmp_env.action_space
        self.num_agents = tmp_env.num_agents

        self.reward_shaping_scheme = None
        if self.cfg.with_pbt:
            if hasattr(tmp_env.unwrapped, '_reward_shaping_wrapper'):
                # noinspection PyProtectedMember
                self.reward_shaping_scheme = tmp_env.unwrapped._reward_shaping_wrapper.reward_shaping_scheme
            else:
                try:
                    from envs.doom.multiplayer.doom_multiagent_wrapper import MultiAgentEnv
                    if isinstance(tmp_env.unwrapped, MultiAgentEnv):
                        self.reward_shaping_scheme = tmp_env.unwrapped.default_reward_shaping
                except ImportError:
                    pass

        tmp_env.close()

        # shared memory allocation
        self.traj_buffers = SharedBuffers(self.cfg, self.num_agents,
                                          self.obs_space, self.action_space)

        self.actor_workers = None

        self.report_queue = MpQueue(20 * 1000 * 1000)
        self.policy_workers = dict()
        self.policy_queues = dict()

        self.learner_workers = dict()

        self.workers_by_handle = None

        self.policy_inputs = [[] for _ in range(self.cfg.num_policies)]
        self.policy_outputs = dict()
        for worker_idx in range(self.cfg.num_workers):
            for split_idx in range(self.cfg.worker_num_splits):
                self.policy_outputs[(worker_idx, split_idx)] = dict()

        self.policy_avg_stats = dict()
        self.policy_lag = [dict() for _ in range(self.cfg.num_policies)]

        self.last_timing = dict()
        self.env_steps = dict()
        self.samples_collected = [0 for _ in range(self.cfg.num_policies)]
        self.total_env_steps_since_resume = 0

        # currently this applies only to the current run, not experiment as a whole
        # to change this behavior we'd need to save the state of the main loop to a filesystem
        self.total_train_seconds = 0

        self.last_report = time.time()
        self.last_experiment_summaries = 0

        self.report_interval = 5.0  # sec
        self.experiment_summaries_interval = self.cfg.experiment_summaries_interval  # sec

        self.avg_stats_intervals = (2, 12, 60)  # 10 seconds, 1 minute, 5 minutes

        self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
        self.throughput_stats = [
            deque([], maxlen=5) for _ in range(self.cfg.num_policies)
        ]
        self.avg_stats = dict()
        self.stats = dict()  # regular (non-averaged) stats

        self.writers = dict()
        writer_keys = list(range(self.cfg.num_policies))
        for key in writer_keys:
            summary_dir = join(summaries_dir(experiment_dir(cfg=self.cfg)),
                               str(key))
            summary_dir = ensure_dir_exists(summary_dir)
            self.writers[key] = SummaryWriter(summary_dir, flush_secs=20)

        self.pbt = PopulationBasedTraining(self.cfg,
                                           self.reward_shaping_scheme,
                                           self.writers)
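A hedged sketch of how the per-policy writers created above might be used when reporting stats; runner stands for an instance of this class, and the tag and value are purely illustrative:

# Illustrative reporting call: one scalar per policy, written to that policy's writer.
for policy_id, writer in runner.writers.items():
    env_steps_for_policy = runner.env_steps.get(policy_id, 0)
    writer.add_scalar('stats/example_metric', 42.0, env_steps_for_policy)
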
Example #7
def main():
    """Script entry point."""
    stop_at = 80 * 1000 * 1000
    prefix = 'simple'

    # noinspection PyUnusedLocal
    experiments_very_sparse = [
        Experiment('doom_curious_vs_vanilla/doom_maze_very_sparse/doom_maze_very_sparse_pre_0.0', 'A2C (no curiosity)'),
        Experiment('doom_sweep_very_sparse/doom_sweep_i_0.5_p_0.05', 'A2C+ICM (curious)'),
    ]

    # noinspection PyUnusedLocal
    experiments_sparse = [
        Experiment('doom_curious_vs_vanilla/doom_maze_sparse/doom_maze_sparse_pre_0.0', 'A2C (no curiosity)'),
        Experiment('doom_curious_vs_vanilla/doom_maze_sparse/doom_maze_sparse_pre_0.05', 'A2C+ICM (curious)'),
    ]

    # noinspection PyUnusedLocal
    experiments_basic = [
        Experiment('doom_curious_vs_vanilla/doom_maze/doom_maze_pre_0.0', 'A2C (no curiosity)'),
        Experiment('doom_curious_vs_vanilla/doom_maze/doom_maze_pre_0.05', 'A2C+ICM (curious)'),
    ]

    experiments = [
        Experiment('doom_curious_vs_vanilla/doom_basic/doom_basic_pre_0.0', 'A2C (no curiosity)'),
        Experiment('doom_curious_vs_vanilla/doom_basic/doom_basic_pre_0.05', 'A2C+ICM (curious)'),
    ]

    plots = [
        Plot('a2c_aux_summary/avg_reward', 'average reward', 'Avg. reward for the last 1000 episodes'),
        Plot(
            'a2c_agent_summary/policy_entropy',
            'policy entropy, nats',
            'Stochastic policy entropy',
        ),
    ]

    for plot in plots:
        fig = plt.figure(figsize=(5, 4))
        fig.add_subplot()

        for ex_i, experiment in enumerate(experiments):
            experiment_name = experiment.name.split(os.sep)[-1]
            experiments_root = join(*(experiment.name.split(os.sep)[:-1]))
            exp_dir = experiment_dir(experiment_name, experiments_root)

            path_to_events_dir = summaries_dir(exp_dir)
            events_files = []
            for f in os.listdir(path_to_events_dir):
                if f.startswith('events'):
                    events_files.append(join(path_to_events_dir, f))

            if len(events_files) == 0:
                log.error('No events file for %s', experiment)
                continue

            events_files = sorted(events_files)
            steps, values = [], []

            for events_file in events_files:
                iterator = tf.train.summary_iterator(events_file)
                while True:
                    try:
                        e = next(iterator, None)
                    except Exception as exc:
                        log.warning(str(exc))
                        break

                    if e is None:
                        break

                    for v in e.summary.value:
                        if e.step >= stop_at:
                            break

                        if v.tag == plot.name:
                            steps.append(e.step)
                            values.append(v.simple_value)

            # just in case
            values = np.nan_to_num(values)

            smooth = 10
            values_smooth = running_mean(values, smooth)
            steps = steps[smooth:]
            values = values[smooth:]

            plt.plot(steps, values, color=COLORS[ex_i], alpha=0.2, label='__nolegend__')
            plt.plot(steps, values_smooth, color=COLORS[ex_i], label=experiment.descr, linewidth=2)

        plt.xlabel('environment steps')
        plt.ylabel(plot.axis)
        plt.title(plot.descr)
        plt.grid(True)
        plt.legend()
        plt.tight_layout()

        plots_dir = ensure_dir_exists(join(experiments_dir(), 'plots'))
        plot_name = plot.name.replace('/', '_')
        plt.savefig(join(plots_dir, f'{prefix}_{plot_name}.png'))
        plt.close()

    return 0
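running_mean used in the plotting loop above is a project helper. Assuming it computes a simple moving average over a window of smooth points, a minimal NumPy sketch:

# Hypothetical sketch of running_mean: simple moving average with window n.
def running_mean(x, n):
    cumsum = np.cumsum(np.insert(np.asarray(x, dtype=np.float64), 0, 0.0))
    return (cumsum[n:] - cumsum[:-n]) / float(n)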