Example #1
# Imports assumed for this standalone snippet (gym, networkx, numpy, matplotlib, stable-baselines 2.x)
import gym
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from typing import Optional, Tuple

from stable_baselines import DQN
from stable_baselines.deepq.policies import LnMlpPolicy


def train_once(graph: nx.Graph, clusters: list, pos: dict, env_name: str = 'Controller-Select-v0', compute_optimal: bool = True, trained_model: Optional[DQN] = None, steps: int = 200000, logdir: str = 'train_log_compare', env_kwargs: Optional[dict] = None) -> Tuple[DQN, float, float]:
	"""
	Main training loop. Initializes RL environment, performs training, and outputs results
	Args:
		graph (nx.Graph): NetworkX graph to train on
		clusters (list): List of lists of nodes in each cluster
		pos (dict): Graph rendering positions
		env_name (str): Name of Gym environment
		compute_optimal (bool): Whether to compute optimal set of controllers by brute-force
		trained_model (DQN): Provide starting model to train on
	Return:
		Trained model
	"""
	# Selecting controllers one-at-a-time environment
	env_kwargs = env_kwargs or {}
	env = gym.make(env_name, graph=graph, clusters=clusters, pos=pos, **env_kwargs)
	heuristic_controllers, heuristic_distance = env.compute_greedy_heuristic()
	print("WMSCP Greedy Heuristic: {}, {}".format(heuristic_controllers, heuristic_distance))
	#for i in range(1000):
	#	env.reset()
	#	print(env.graph.size(weight='weight'))
	orig_graph = env.original_graph
	optimal_controllers = None
	if compute_optimal:
		print("Computing optimal!")
		optimal_controllers = env.calculateOptimal()


	# Generate custom replay buffer full of valid experiences to speed up exploration of training
	def add_wrapper(replay_buffer):
		# Replay buffer maxsize is by default 50000. Should this be lowered?
		# valid_controllers_set = [env._random_valid_controllers() for i in range(int(replay_buffer._maxsize * 0.5 / len(clusters)))]
		# Uses the heuristic controller set as the initial 'random' controllers
		valid_controllers_set = env.graphCentroidAction()
	
		for valid_controllers in valid_controllers_set:
			obs_current = env.reset()  # Really strange issue - obs_current follows the change in env.state, making it equal to obs!
			for controller in valid_controllers:
				(obs, rew, done, _) = env.step(controller)
				replay_buffer.add(obs_current, controller, rew, obs, done)  # For some reason, obs is a pointer which ends up being the very last obs before reset, so need to copy
				obs_current = obs.copy()
		return replay_buffer

	# Agent
	model = None
	if trained_model is None:
		print("Creating new training model!")
		model = DQN(LnMlpPolicy, env, tensorboard_log=logdir, verbose=0, full_tensorboard_log=True, exploration_initial_eps=0.5, exploration_fraction=0.2, learning_starts=0, target_network_update_freq=100, batch_size=32, learning_rate=0.00025)
	else:
		print("Using provided training model!")
		model = trained_model
		model.set_env(env)
		model.tensorboard_log = logdir

	# Train the agent
	print("Training!")
	model.learn(total_timesteps=int(steps))#, callback=callback)#, replay_wrapper=add_wrapper)

	# Run a single run to evaluate the DQN
	obs = env.reset()
	reward = 0  # We want the last reward to be minimal (perhaps track cumulative reward instead?)
	reward_final = 0
	done = False
	action = None
	final_rl_actions = []
	while not done:
		action, _states = model.predict(obs)
		(obs, rew, done, _) = env.step(action)
		final_rl_actions.append(action)
		reward += rew
		reward_final = rew

	# Show controllers chosen by the model
	env.render(mode='graph_end.png')
	print(env.controllers, reward_final)
	print("BEST EVER:")
	print(env.best_controllers, env.best_reward)
	best_reward = env.optimal_neighbors(graph, env.best_controllers)
	print(best_reward)

	average_graph = env.average_graph.copy()
	rl_controllers = env.controllers
	rl_best_controllers = env.best_controllers
	if env_name == 'Controller-Cluster-v0':
		rl_controllers.sort()
		rl_best_controllers.sort()
		cluster_len = len(clusters[0])
		for i in range(len(clusters)):
			rl_controllers[i] -= i * cluster_len
			rl_best_controllers[i] -= i * cluster_len
	env.reset(adjust=False, full=True)
	nx.write_gpickle(average_graph, 'average_graph.gpickle')
	env.graph = average_graph.copy()
	for cont in rl_controllers:
		(_, reward_final, _, _) = env.step(cont)
	print("RL Controllers on average change graph {} - {}".format(env.controllers, reward_final))
	env.reset(adjust=False, full=True)
	env.graph = average_graph.copy()
	for cont in rl_best_controllers:
		(_, reward_final, _, _) = env.step(cont)
	print("RL Best Controllers on average change graph {} - {}".format(env.best_controllers, reward_final))
	# Show controllers chosen using heuristic
	centroid_controllers, heuristic_distance = env.graphCentroidAction()
	#centroid_controllers, heuristic_distance = env.compute_greedy_heuristic()
	# Convert heuristic controllers to actual
	if env_name == 'Controller-Cluster-v0' or env_name == 'Controller-Cluster-Options-v0':
		# Assume all clusters same length
		centroid_controllers.sort()
		cluster_len = len(clusters[0])
		for i in range(len(clusters)):
			centroid_controllers[i] -= i * cluster_len
	env.reset(adjust=False, full=True)
	env.graph = average_graph.copy()
	for cont in centroid_controllers:
		(_, reward_final, _, _) = env.step(cont)
	env.render(mode='graph_heuristic.png')
	best_heuristic = reward_final
	print("Heuristic on average change graph {} - {}".format(env.controllers, reward_final))
	#print("Heuristic optimal {} - {}".format(*env.optimal_neighbors(graph,  env.controllers)))
	heuristic_controllers = env.controllers

	rl_rewards = []
	heuristic_rewards = []
	rl_best_rewards = []
	NUM_GRAPHS = 100
	for i in range(NUM_GRAPHS):
		rl_reward = None
		heuristic_reward = None
		rl_best_reward = None
		env.reset()
		# Note: assumes the '100Graphs/' directory already exists
		nx.write_gpickle(env.graph, '100Graphs/graph_{}.gpickle'.format(i))
		for cont in final_rl_actions:
			(_, rl_reward, _, _) = env.step(cont)
		env.reset(adjust=False, full=False)
		for cont in centroid_controllers:
			(_, heuristic_reward, _, _) = env.step(cont)
		env.reset(adjust=False, full=False)
		for cont in rl_best_controllers:
			(_, rl_best_reward, _, _) = env.step(cont)
		print("RL REWARD, RL BEST REWARD, HEURISTIC: {}\t{}\t{}".format(rl_reward, rl_best_reward, heuristic_reward))
		rl_rewards.append(rl_reward)
		heuristic_rewards.append(heuristic_reward)
		rl_best_rewards.append(rl_best_reward)

	def create_hist(ax, data, title=None, color=None):
		# 'ax' is a matplotlib Axes object
		bins = np.arange(min(data) - 100, max(data) + 100, 100)
		ax.set_xlim([min(data) - 100, max(data) + 100])
		ax.hist(data, bins=bins, alpha=0.5, color=color)
		if title:
			ax.set_title(title)
		ax.set_xlabel('Controller Distances')
		ax.set_ylabel('Count')
	fig = plt.figure()
	ax1 = fig.add_subplot(2, 1, 1)
	create_hist(ax1, rl_rewards, color='blue')
	create_hist(ax1, heuristic_rewards, color='red')
	create_hist(ax1, rl_best_rewards, color='green')
	ax2 = fig.add_subplot(2, 1, 2)
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_rewards, c='blue')
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), heuristic_rewards, c='red')
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_best_rewards, c='green')
	plt.show()
	# Show optimal
	if optimal_controllers is not None:
		env.reset()
		for cont in optimal_controllers[0]:
			(_, reward_final, _, _) = env.step(cont)
		env.render(mode='graph_optimal.png')
		print(env.controllers, reward_final)
		print(optimal_controllers)
	return model, best_reward, best_heuristic
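
A minimal usage sketch for train_once follows, assuming the project's custom Gym environments (e.g. 'Controller-Select-v0') are already registered and that clusters are equally sized node lists, as the cluster-index arithmetic above expects. The graph, cluster split, and step count are illustrative placeholders, and the evaluation loop above writes gpickle files into a '100Graphs/' directory that must exist beforehand.

# Usage sketch (illustrative values; the custom environments must already be registered with gym)
import networkx as nx

graph = nx.connected_watts_strogatz_graph(20, 4, 0.3, seed=1)
for u, v in graph.edges():
	graph[u][v]['weight'] = 1.0  # assumed: the environments expect weighted edges
clusters = [list(range(0, 10)), list(range(10, 20))]  # two equally sized clusters
pos = nx.spring_layout(graph)

model, best_rl_reward, heuristic_reward = train_once(
	graph, clusters, pos,
	compute_optimal=False,  # brute-force search over controller sets is expensive
	steps=50000,
)
model.save('controller_dqn')  # standard stable-baselines save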
Example #2
    def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
        """Initialize.

        :param env: gym environment. Assuming observation space is a tuple,
            where first component is from original env, and the second is
            temporal goal state.
        :param params: dict of parameters, like `default_parameters`.
        :param model_path: directory where to save models.
        :param log_path: directory where to save tensorboard logs.
        """
        # Check
        if params["initialize_file"]:
            raise ValueError(
                "Initialization not supported; use resuming option")
        if params["action_bias"]:
            raise ValueError("Action bias is not maintained here")

        # Alias
        original_env = env

        # Load a saved agent for the action bias
        # (unreachable while the check above rejects a non-empty "action_bias")
        self.biased_agent: Optional[DQN] = None
        if params["action_bias"]:
            loading_params = dict(params)
            loading_params["resume_file"] = params["action_bias"]
            loading_params["action_bias"] = None

            self.biased_agent = TrainStableBaselines(
                env=env,
                params=loading_params,
                model_path=model_path,
                log_path=log_path,
            ).model

        # Collect statistics
        #    (assuming future wrappers do not modify episodes)
        env = MyStatsRecorder(env=env, gamma=params["gamma"])

        # Callbacks
        checkpoint_callback = CustomCheckpointCallback(
            save_path=model_path,
            save_freq=params["save_freq"],
            extra=None,
        )
        stats_logger_callback = StatsLoggerCallback(stats_recorder=env,
                                                    scope="env0")

        callbacks_list = [checkpoint_callback, stats_logger_callback]
        if params["render"]:
            renderer_callback = RendererCallback()
            callbacks_list.append(renderer_callback)

        # If training a passive agent log this too
        if params["active_passive_agents"]:

            # Find the reward shaping env
            reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

            passive_stats_env = MyStatsRecorder(
                env=UnshapedEnv(reward_shaping_env),
                gamma=params["gamma"],
            )

            passive_stats_callback = StatsLoggerCallback(
                stats_recorder=passive_stats_env,
                scope="env1",
            )
            callbacks_list.append(passive_stats_callback)

            # Make it move with the original env
            env = UnshapedEnvWrapper(
                shaped_env=env,
                unshaped_env=passive_stats_env,
            )
            original_reward_getter = env.get_reward  # alias
        else:
            original_reward_getter = None

        # Combine callbacks
        all_callbacks = CallbackList(callbacks_list)

        # Define or load
        resuming = bool(params["resume_file"])
        if not resuming:
            # Normalizer
            normalized_env = NormalizeEnvWrapper(
                env=env,
                training=True,
                entry=0,  # Only env features, not temporal goal state
            )
            flat_env = BoxAutomataStates(normalized_env)
            # Saving normalizer too
            checkpoint_callback.saver.extra_model = normalized_env

            # Agent
            model = DQN(
                env=flat_env,
                policy=ModularPolicy,
                policy_kwargs={
                    "layer_norm": params["layer_norm"],
                    "layers": params["layers"],
                    "shared_layers": params["shared_layers"],
                    "dueling": params["dueling"],
                },
                gamma=params["gamma"],
                learning_rate=params["learning_rate"],
                train_freq=params["train_freq"],
                double_q=True,
                batch_size=params["batch_size"],
                buffer_size=params["buffer_size"],
                learning_starts=params["learning_starts"],
                prioritized_replay=True,
                target_network_update_freq=params["target_network_update_freq"],
                exploration_fraction=params["exploration_fraction"],
                exploration_final_eps=params["exploration_final_eps"],
                exploration_initial_eps=params["exploration_initial_eps"],
                active_passive_agents=params["active_passive_agents"],
                passive_reward_getter=original_reward_getter,
                tensorboard_log=log_path,
                full_tensorboard_log=False,
                verbose=1,
            )
        else:
            # Reload model
            model, extra_model, counters = checkpoint_callback.load(
                path=params["resume_file"])

            # Restore normalizer and env
            normalized_env = extra_model
            normalized_env.set_env(env)
            flat_env = BoxAutomataStates(normalized_env)

            # Restore properties
            model.tensorboard_log = log_path
            model.num_timesteps = counters["step"]
            model.learning_starts = params["learning_starts"] + counters["step"]
            model.set_env(flat_env)
            model.passive_reward_getter = original_reward_getter

        # Store
        self.params = params
        self.resuming = resuming
        self.saver = checkpoint_callback
        self.logger = stats_logger_callback
        self.callbacks = all_callbacks
        self.model: DQN = model
        self.normalized_env = normalized_env
        self.testing_agent = (
            model if not params["test_passive"] else model.passive_agent
        )