def set_schedule_params(n_epochs, dataset_size):
    schedule_params = ScheduleParameters()

    # n_epochs epochs of training (we train over the entire dataset every epoch)
    schedule_params.improve_steps = TrainingSteps(n_epochs)

    # we evaluate the model every epoch
    schedule_params.steps_between_evaluation_periods = TrainingSteps(1)

    # only relevant when an environment is available
    schedule_params.evaluation_steps = EnvironmentEpisodes(10)

    # to keep the collected data purely random, the entire dataset is created during heatup
    # (TODO: is this purely random, or does it use an uninitialized network?)
    schedule_params.heatup_steps = EnvironmentSteps(dataset_size)

    return schedule_params
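# Usage sketch (illustrative values only; assumes the rl_coach imports used by the surrounding
# snippets are in scope): the returned ScheduleParameters object is what the graph managers
# further below consume through their `schedule_params` argument.
example_schedule_params = set_schedule_params(n_epochs=100, dataset_size=50000)
assert example_schedule_params.improve_steps.num_steps == 100
assert example_schedule_params.heatup_steps.num_steps == 50000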
def __init__(self):
    # Architecture parameters
    self.use_accumulated_reward_as_measurement = False

    # Agent parameters
    self.num_consecutive_playing_steps = EnvironmentSteps(1)
    self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps
    self.heatup_using_network_decisions = False
    self.discount = 0.99
    self.apply_gradients_every_x_episodes = 5
    self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
    self.rate_for_copying_weights_to_target = 1.0
    self.load_memory_from_file_path = None
    self.store_transitions_only_when_episodes_are_terminated = False

    # HRL / HER related params
    self.in_action_space = None

    # distributed agents params
    self.share_statistics_between_workers = True

    # intrinsic reward
    self.scale_external_reward_by_intrinsic_reward_value = False

    # n-step returns
    self.n_step = -1  # calculate the total return (no bootstrap, by default)

    # Distributed Coach params
    self.distributed_coach_synchronization_type = None

    # Should the workers wait for a full episode
    self.act_for_full_episodes = False
def __init__(self):
    # Architecture parameters
    self.use_accumulated_reward_as_measurement = False

    # Agent parameters
    self.num_consecutive_playing_steps = EnvironmentSteps(1)
    self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps
    self.heatup_using_network_decisions = False
    self.discount = 0.99
    self.apply_gradients_every_x_episodes = 5
    self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
    self.rate_for_copying_weights_to_target = 1.0
    self.load_memory_from_file_path = None
    self.collect_new_data = True
    self.store_transitions_only_when_episodes_are_terminated = False

    # HRL / HER related params
    self.in_action_space = None

    # distributed agents params
    self.share_statistics_between_workers = True

    # intrinsic reward
    self.scale_external_reward_by_intrinsic_reward_value = False
def set_agent_params(agent_params_func):
    #########
    # Agent #
    #########
    agent_params = agent_params_func()
    agent_params.network_wrappers['main'].batch_size = 128
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
    agent_params.algorithm.discount = 0.99

    # to jump start the agent's Q values, and speed things up, we'll initialize the last Dense layer's bias
    # with a number in the order of the discounted reward of a random policy
    agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # NN configuration
    agent_params.network_wrappers['main'].learning_rate = 0.0001
    agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    agent_params.network_wrappers['main'].softmax_temperature = 0.2

    # ER - we'll need an episodic replay buffer for off-policy evaluation
    agent_params.memory = EpisodicExperienceReplayParameters()

    # E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy.
    agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
    agent_params.exploration.evaluation_epsilon = 0

    return agent_params
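# Usage sketch (illustrative): set_agent_params expects the parameters *class*, not an instance,
# as in the Batch RL flow further below. Assumes TensorFlow and the rl_coach imports used by the
# surrounding snippets (QHeadParameters, LinearSchedule, EpisodicExperienceReplayParameters,
# TrainingSteps) are already in scope in the defining module.
from rl_coach.agents.ddqn_agent import DDQNAgentParameters

example_agent_params = set_agent_params(DDQNAgentParameters)
assert example_agent_params.network_wrappers['main'].batch_size == 128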
def __init__(self,
             improve_steps=TrainingSteps(10000000000),
             steps_between_evaluation_periods=EnvironmentEpisodes(50),
             evaluation_steps=EnvironmentEpisodes(5)):
    super().__init__()
    self.heatup_steps = EnvironmentSteps(0)
    self.evaluation_steps = evaluation_steps
    self.steps_between_evaluation_periods = steps_between_evaluation_periods
    self.improve_steps = improve_steps
def __init__(self):
    super().__init__()
    self.rate_for_copying_weights_to_target = 0.005
    self.use_target_network_for_evaluation = False
    self.action_penalty = 0
    self.clip_critic_targets = None  # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
    self.use_non_zero_discount_for_terminal_states = False
    self.act_for_full_episodes = True
    self.update_policy_every_x_episode_steps = 2
    self.num_steps_between_copying_online_weights_to_target = TrainingSteps(self.update_policy_every_x_episode_steps)
    self.policy_noise = 0.2
    self.noise_clipping = 0.5
    self.num_q_networks = 2
def __init__(self):
    # Architecture parameters
    self.use_accumulated_reward_as_measurement = False

    # Agent parameters
    self.num_consecutive_playing_steps = EnvironmentSteps(1)
    self.num_consecutive_training_steps = 1  # TODO: update this to TrainingSteps
    self.heatup_using_network_decisions = False
    self.discount = 0.99
    self.apply_gradients_every_x_episodes = 5
    self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
    self.rate_for_copying_weights_to_target = 1.0
    self.load_memory_from_file_path = None
    self.store_transitions_only_when_episodes_are_terminated = False

    # HRL / HER related params
    self.in_action_space = None

    # distributed agents params
    self.share_statistics_between_workers = True

    # n-step returns
    self.n_step = -1  # calculate the total return (no bootstrap, by default)

    # Distributed Coach params
    self.distributed_coach_synchronization_type = None

    # Should the workers wait for a full episode
    self.act_for_full_episodes = False

    # Support for parameter noise
    self.supports_parameter_noise = False

    # Override, in retrospect, all the episode rewards with the last reward in the episode
    # (sometimes useful for sparse, end-of-episode reward problems)
    self.override_episode_rewards_with_the_last_transition_reward = False

    # Filters - TODO consider creating a FilterParameters class and initializing the filters with it
    self.update_pre_network_filters_state_on_train = False
    self.update_pre_network_filters_state_on_inference = True
def train_and_act(self, steps: StepMethod) -> None:
    """
    Train the agent by continually interleaving several acting steps with several training steps
    :param steps: the number of steps to run, given as a StepMethod (the step class defines the
                  step type, num_steps defines the count)
    :return: None
    """
    # perform several steps of training interleaved with acting
    if steps.num_steps > 0:
        self.phase = RunPhase.TRAIN
        count_end = self.total_steps_counters[self.phase][steps.__class__] + steps.num_steps
        self.reset_internal_state(force_environment_reset=True)
        # TODO - the while loop below should end with full episodes, so as to avoid situations
        #  where we have partial episodes in memory
        while self.total_steps_counters[self.phase][steps.__class__] < count_end:
            # The actual steps being done on the environment are decided by the agents themselves.
            # This is just a high-level controller.
            self.act(EnvironmentSteps(1))
            self.train(TrainingSteps(1))
            self.save_checkpoint()
        self.phase = RunPhase.UNDEFINED
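# Small sketch of the StepMethod bookkeeping used above (a minimal example, assuming only
# rl_coach.core_types): the counter dictionary is keyed by the step class, so EnvironmentSteps
# and TrainingSteps totals are tracked independently of each other.
from rl_coach.core_types import EnvironmentSteps, TrainingSteps

steps = EnvironmentSteps(1000)
print(steps.__class__, steps.num_steps)  # e.g. <class 'rl_coach.core_types.EnvironmentSteps'> 1000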
    VisualizationParameters,
)
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(int(5e5))
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(50000)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(150000)

#########
# Agent #
#########

agent_params = ActorCriticAgentParameters()
agent_params.algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1 / 10000.0))
agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.gae_lambda = 0.95
def get_graph_manager(hp_dict, agent_list, run_phase_subject):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        agent_params = DeepRacerAgentParams()
        if agent.network_settings:
            agent_params.env_agent = agent
            agent_params.network_wrappers['main'].learning_rate = params["lr"]

            agent_params.network_wrappers['main'].input_embedders_parameters = \
                create_input_embedder(agent.network_settings['input_embedders'],
                                      agent.network_settings['embedder_type'],
                                      agent.network_settings['activation_function'])
            agent_params.network_wrappers['main'].middleware_parameters = \
                create_middle_embedder(agent.network_settings['middleware_embedders'],
                                       agent.network_settings['embedder_type'],
                                       agent.network_settings['activation_function'])

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping', ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()

            agent_params.network_wrappers['main'].batch_size = params["batch_size"]
            agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
            agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

            if params["loss_type"] == "huber":
                agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

            agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
            agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
            agent_params.algorithm.beta_entropy = params["beta_entropy"]
            agent_params.algorithm.gae_lambda = 0.95
            agent_params.algorithm.discount = params["discount_factor"]
            agent_params.algorithm.optimization_epochs = params["num_epochs"]
            agent_params.algorithm.estimate_state_value_using_gae = True
            agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.num_consecutive_playing_steps = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.distributed_coach_synchronization_type = \
                DistributedCoachSynchronizationType.SYNC

            if params["exploration_type"] == "categorical":
                agent_params.exploration = CategoricalParameters()
            else:
                agent_params.exploration = EGreedyParameters()
                agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                           params["e_greedy_value"],
                                                                           params["epsilon_steps"])
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params)
    return graph_manager, params_json
def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                   params["e_greedy_value"],
                                                                   params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    SilverstoneInputFilter = InputFilter(is_a_reference_filter=True)
    SilverstoneInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    SilverstoneInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    SilverstoneInputFilter.add_observation_filter('observation', 'stacking',
                                                  ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = SilverstoneInputFilter
    env_params.level = 'SilverstoneRacetrack-Discrete-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                        env_params=env_params,
                                        schedule_params=schedule_params,
                                        vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
from rl_coach.filters.observation.observation_reduction_by_sub_parts_name_filter import \
    ObservationReductionBySubPartsNameFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import ImageObservationSpace
from rl_coach.utilities.carla_dataset_to_replay_buffer import create_dataset

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = TrainingSteps(500)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

################
# Agent Params #
################

agent_params = CILAgentParameters()

# forward camera and measurements input
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'CameraRGB': InputEmbedderParameters(scheme=[Conv2d([32, 5, 2]),
                                                 Conv2d([32, 3, 1]),
                                                 Conv2d([64, 3, 2]),
                                                 Conv2d([64, 3, 1]),
from rl_coach.memories.episodic import EpisodicExperienceReplayParameters
from rl_coach.architectures.head_parameters import QHeadParameters
from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters
from rl_coach.agents.ddqn_bcq_agent import KNNParameters, NNImitationModelParameters

DATASET_SIZE = 100000

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
# schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.improve_steps = TrainingSteps(400)  # 400 epochs
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)

#########
# Agent #
#########

agent_params = DDQNBCQAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128

# TODO cross-DL framework abstraction for a constant initializer?
agent_params.network_wrappers['main'].heads_parameters = \
    [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
# agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(500)
from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.base_parameters import TaskParameters
from rl_coach.spaces import SpacesDefinition, DiscreteActionSpace, VectorObservationSpace, StateSpace, RewardSpace

####################
# Graph Scheduling #
####################

task_parameters = TaskParameters(experiment_path='./tmp', checkpoint_save_dir='./tmp')

schedule_params = ScheduleParameters()

# 100 epochs of training (we train over the entire dataset every epoch)
schedule_params.improve_steps = TrainingSteps(100)

# we evaluate the model every epoch
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)

tf.reset_default_graph()  # just to clean things up; only needed for the tutorial

#########
# Agent #
#########

agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
agent_params.algorithm.discount = 0.99
def get_graph_manager(hp_dict, agent_list, run_phase_subject, enable_domain_randomization=False,
                      done_condition=any, run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent, params, run_type)
            else:
                agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping', ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params,
                                           done_condition=done_condition)
    return graph_manager, params_json
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, \
    PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule

bit_length = 8

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(400000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(16 * 50)  # 50 cycles
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########

agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].batch_size = 128
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(256)]
agent_params.network_wrappers['main'].input_embedders_parameters = {
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward import RewardRescaleFilter
from rl_coach.graph_managers.batch_rl_graph_manager import BatchRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule
from rl_coach.memories.episodic import EpisodicExperienceReplayParameters

DATASET_SIZE = 40000

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)

#########
# Agent #
#########

# TODO add a preset which uses a dataset to train a BatchRL graph. e.g. save a cartpole dataset in a csv format.
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128

# DQN params
# agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
# For making this become Fitted Q-Iteration we can keep the targets constant for the entire dataset size -
        attention_weights = tf.nn.softmax(V(score), axis=1)
        context_vector = attention_weights * conv
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector

    def __str__(self):
        return "Convolution (num filters = {}, kernel size = {}, stride = {})"\
            .format(self.num_filters, self.kernel_size, self.strides)


####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########

agent_params = ClippedPPOAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [
    Conv2d(32, 8, 4),
    Conv2d(64, 4, 2),
    Conv2dWithAttention(64, 3, 1, 256)
def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["lr_decay_rate"] = float(hp_dict.get("lr_decay_rate", 0))
    params["lr_decay_steps"] = float(hp_dict.get("lr_decay_steps", 0))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))
    params["tensorboard"] = hp_dict.get("tensorboard", False)
    params["dump_mp4"] = hp_dict.get("dump_mp4", False)
    params["dump_gifs"] = hp_dict.get("dump_gifs", False)

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].learning_rate_decay_rate = params["lr_decay_rate"]
    agent_params.network_wrappers['main'].learning_rate_decay_steps = params["lr_decay_steps"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    # Replace the default CNN with a single layer Conv2d(32, 3, 1)
    # agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Shallow
    # agent_params.network_wrappers['main'].input_embedders_parameters['observation'].dropout_rate = 0.3
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    # agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Shallow
    # agent_params.network_wrappers['main'].middleware_parameters.dropout_rate = 0.3
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
    # agent_params.network_wrappers['main'].l2_regularization = 2e-5

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, params["e_greedy_value"],
                                                                   params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    # Optionally add an observation image color perturbation filter
    # DeepRacerInputFilter.add_observation_filter('observation', 'perturb_color', ObservationColorPerturbation(0.2))
    # Rescale to a much smaller input when using shallow networks to avoid OOM
    # DeepRacerInputFilter.add_observation_filter('observation', 'rescaling',
    #                                             ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
    #                                                                                                  high=255)))
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter('observation', 'stacking',
                                                ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.tensorboard = params["tensorboard"]
    vis_params.dump_mp4 = params["dump_mp4"]
    vis_params.dump_gifs = params["dump_gifs"]
    # AlwaysDumpFilter, MaxDumpFilter, EveryNEpisodesDumpFilter, SelectedPhaseOnlyDumpFilter
    vis_params.video_dump_filters = [AlwaysDumpFilter()]

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                        env_params=env_params,
                                        schedule_params=schedule_params,
                                        vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter, InputFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.memories.memory import MemoryGranularity
from markov import environments

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(1)  # Changing to 100K
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########

agent_params = ClippedPPOAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].batchnorm = True
agent_params.network_wrappers['main'].input_embedders_parameters[
def solve(self, model, app_args, amc_cfg, services, steps_per_episode):
    msglogger.info("AMC: Using coach")

    # When we import the graph_manager from the ADC_DDPG preset, we implicitly instruct
    # Coach to create and use our DistillerWrapperEnvironment environment.
    # So Distiller calls Coach, which creates the environment, trains the agent, and ends.
    if amc_cfg.agent_algo == "DDPG":
        from examples.auto_compression.amc.rl_libs.coach.presets.ADC_DDPG import (
            graph_manager, agent_params)
        graph_manager.agent_params.exploration.noise_schedule = ExponentialSchedule(
            amc_cfg.ddpg_cfg.initial_training_noise, 0, amc_cfg.ddpg_cfg.training_noise_decay)
        # Number of iterations to train
        graph_manager.agent_params.algorithm.num_consecutive_training_steps = steps_per_episode
        # graph_manager.agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(1)
        graph_manager.agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(
            steps_per_episode)
        # Heatup
        graph_manager.heatup_steps = EnvironmentEpisodes(amc_cfg.ddpg_cfg.num_heatup_episodes)
        # Replay buffer size
        graph_manager.agent_params.memory.max_size = (MemoryGranularity.Transitions,
                                                      amc_cfg.ddpg_cfg.replay_buffer_size)
        amc_cfg.ddpg_cfg.training_noise_decay = amc_cfg.ddpg_cfg.training_noise_decay**(1. / steps_per_episode)
    elif "ClippedPPO" in amc_cfg.agent_algo:
        from examples.auto_compression.amc.rl_libs.coach.presets.ADC_ClippedPPO import graph_manager, agent_params
    elif "TD3" in amc_cfg.agent_algo:
        from examples.auto_compression.amc.rl_libs.coach.presets.ADC_TD3 import graph_manager, agent_params
    else:
        raise ValueError("The agent algorithm you are trying to use (%s) is not supported" % amc_cfg.agent_algo)

    # Number of training steps
    n_training_episodes = amc_cfg.ddpg_cfg.num_training_episodes
    graph_manager.improve_steps = EnvironmentEpisodes(n_training_episodes)
    # Don't evaluate until the end
    graph_manager.steps_between_evaluation_periods = EnvironmentEpisodes(n_training_episodes)

    # These parameters are passed to the Distiller environment
    env_cfg = {'model': model,
               'app_args': app_args,
               'amc_cfg': amc_cfg,
               'services': services}
    graph_manager.env_params.additional_simulator_parameters = env_cfg

    coach_logs_dir = os.path.join(msglogger.logdir, 'coach')
    os.mkdir(coach_logs_dir)
    task_parameters = TaskParameters(experiment_path=coach_logs_dir)
    # Set Coach's PRNG seed
    if app_args.seed is not None:
        task_parameters.seed = app_args.seed
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
# Agent #
#########

agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].batch_size = 128
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([256])]
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'state': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
agent_params.algorithm.discount = 0.98
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16)
agent_params.algorithm.num_consecutive_training_steps = 40
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
agent_params.algorithm.rate_for_copying_weights_to_target = 0.05
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
agent_params.exploration.evaluation_epsilon = 0
agent_params.memory = EpisodicHindsightExperienceReplayParameters()
agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Final
agent_params.memory.hindsight_transitions_per_regular_transition = 1
agent_params.memory.goals_space = GoalsSpace(goal_name='state',
                                             reward_type=ReachingGoal(distance_from_goal_threshold=0,
                                                                      goal_reaching_reward=0,
                                                                      default_reward=-1),
                                             distance_metric=GoalsSpace.DistanceMetric.Euclidean)
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    tf.reset_default_graph()  # just to clean things up; only needed for the tutorial

    # Experience Generating Agent parameters
    experience_generating_agent_params = DDQNAgentParameters()

    # schedule parameters
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size - experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentSteps(100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

    # NN configuration
    experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.0001
    experience_generating_agent_params.network_wrappers['main'].batch_size = 128
    experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # ER size
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule
    experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, dataset_size)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)

    # set the agent params as before
    # agent_params = set_agent_params(DDQNAgentParameters)
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

    # 50 epochs of training (the entire dataset is used each epoch)
    # schedule_params.improve_steps = TrainingSteps(50)

    # task_parameters is expected to be defined at the module level (see the TaskParameters setup above)
    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)

    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
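# Usage sketch (illustrative only; assumes the module-level `task_parameters` and rl_coach imports
# used above, and uses CartPole purely as an example environment choice, not necessarily the one
# from the original tutorial):
from rl_coach.environments.gym_environment import GymVectorEnvironment

example_env_params = GymVectorEnvironment(level='CartPole-v0')
train_using_experience_agent(example_env_params, n_epochs=100, dataset_size=40000)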
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

###############
# Environment #
###############
import gym_guess_number  # pylint: disable=unused-import

env_params = GymVectorEnvironment(level='GuessNumber-v0')

####################
# Graph Scheduling #
####################

training_steps = 200000

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(training_steps)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

######################
# Agent - ClippedPPO #
######################

agent_params = ClippedPPOAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense(64)]
def get_graph_manager(hp_dict, agent_list, run_phase_subject, enable_domain_randomization=False,
                      done_condition=any, run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    # Note: The following three lines are hard-coded to pick the first agent's training algorithm
    # and dump the hyper-parameters for that particular training algorithm into json
    # for training jobs (so that the console displays the training hyper-parameters correctly),
    # since right now we only support training one model at a time.
    # TODO: clean these lines up when we support multi-agent training.
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            training_algorithm = agent.ctrl.model_metadata.training_algorithm
            params = get_updated_hyper_parameters(hp_dict, training_algorithm)
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent, params, run_type)
            else:
                agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping', ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    sector_binary_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=NUMBER_OF_LIDAR_SECTORS,
                        num_values_per_sector=1,
                        clipping_dist=SECTOR_LIDAR_CLIPPING_DIST)
                    input_filter.add_observation_filter(observation, 'binary', sector_binary_filter)
                if observation == Input.DISCRETIZED_SECTOR_LIDAR.value:
                    num_sectors = agent.ctrl.model_metadata.lidar_num_sectors
                    num_values_per_sector = agent.ctrl.model_metadata.lidar_num_values_per_sector
                    clipping_dist = agent.ctrl.model_metadata.lidar_clipping_dist

                    sector_discretize_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=num_sectors,
                        num_values_per_sector=num_values_per_sector,
                        clipping_dist=clipping_dist)
                    input_filter.add_observation_filter(observation, 'discrete', sector_discretize_filter)
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params,
                                           done_condition=done_condition)
    return graph_manager, params_json