def main(model_path, temperature):
    model_path = glob.glob(model_path)[0]
    predictor = DiscreteDqnTorchPredictor(torch.jit.load(model_path))
    predictor.softmax_temperature = temperature
    env = OpenAIGymEnvironment(gymenv=ENV)
    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )
    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
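# Illustrative only: a minimal sketch of how main() above might be wired to a
# command line. The use of argparse and these flag names are assumptions, not
# part of the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Evaluate a serialized DQN predictor on the gym environment."
    )
    parser.add_argument(
        "--model-path", required=True, help="glob pattern for the TorchScript model file"
    )
    parser.add_argument(
        "--softmax-temperature", type=float, default=1.0, help="softmax temperature for the predictor"
    )
    args = parser.parse_args()
    main(args.model_path, args.softmax_temperature)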
def test_open_ai_gym_generate_samples_multi_step(self):
    env = OpenAIGymEnvironment(
        "CartPole-v0",
        epsilon=1.0,  # take random actions to collect training data
        softmax_policy=False,
        gamma=0.9,
    )
    num_samples = 1000
    num_steps = 5
    samples = env.generate_random_samples(
        num_samples,
        use_continuous_action=True,
        epsilon=1.0,
        multi_steps=num_steps,
        include_shorter_samples_at_start=True,
        include_shorter_samples_at_end=True,
    )
    self._check_samples(samples, num_samples, num_steps, True)
def run_gym(
    params: OpenAiGymParameters,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params.max_replay_memory_size)
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    assert replay_buffer.memory_buffer is not None
    state_mem = replay_buffer.memory_buffer.state
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(params, open_ai_env)
    rl_predictor = create_predictor(
        rl_trainer, model_type, params.use_gpu, open_ai_env.action_dim
    )

    assert (
        params.run_details.max_steps is not None
        and params.run_details.offline_train_epochs is not None
    ), "Missing data required for offline training: {}".format(str(params.run_details))
    return train_gym_offline_rl(
        gym_env=open_ai_env,
        replay_buffer=replay_buffer,
        model_type=model_type,
        trainer=rl_trainer,
        predictor=rl_predictor,
        test_run_name="{} offline rl state embed".format(env_type),
        score_bar=score_bar,
        max_steps=params.run_details.max_steps,
        avg_over_num_episodes=params.run_details.avg_over_num_episodes,
        offline_train_epochs=params.run_details.offline_train_epochs,
        num_batch_per_epoch=None,
    )
def mdnrnn_gym(
    params: OpenAiGymParameters,
    feature_importance: bool = False,
    feature_sensitivity: bool = False,
    save_embedding_to_path: Optional[str] = None,
    seed: Optional[int] = None,
):
    assert params.mdnrnn is not None
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)

    env_type = params.env
    env = OpenAIGymEnvironment(
        env_type, epsilon=1.0, softmax_policy=False, gamma=0.99, random_seed=seed
    )

    # create test data once
    assert params.run_details.max_steps is not None
    test_replay_buffer = get_replay_buffer(
        params.run_details.num_test_episodes,
        params.run_details.seq_len,
        params.run_details.max_steps,
        env,
    )
    test_batch = test_replay_buffer.sample_memories(
        test_replay_buffer.memory_size, use_gpu=use_gpu, batch_first=True
    )

    trainer = create_trainer(params, env, use_gpu)
    _, _, trainer = train_sgd(
        env,
        trainer,
        use_gpu,
        "{} test run".format(env_type),
        params.mdnrnn.minibatch_size,
        params.run_details,
        test_batch=test_batch,
    )
    feature_importance_map, feature_sensitivity_map, dataset = None, None, None
    if feature_importance:
        feature_importance_map = calculate_feature_importance(
            env, trainer, use_gpu, params.run_details, test_batch=test_batch
        )
    if feature_sensitivity:
        feature_sensitivity_map = calculate_feature_sensitivity_by_actions(
            env, trainer, use_gpu, params.run_details, test_batch=test_batch
        )
    if save_embedding_to_path:
        dataset = RLDataset(save_embedding_to_path)
        create_embed_rl_dataset(env, trainer, dataset, use_gpu, params.run_details)
        dataset.save()
    return env, trainer, feature_importance_map, feature_sensitivity_map, dataset
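# Illustrative only: one way mdnrnn_gym() above might be invoked to train the
# world model, compute per-feature importance, and persist state embeddings.
# The helper name, output path, and seed are placeholders, not values from the
# original code.
def example_mdnrnn_run(params: OpenAiGymParameters):
    env, trainer, feature_importance_map, _, dataset = mdnrnn_gym(
        params,
        feature_importance=True,
        save_embedding_to_path="/tmp/embed_rl_dataset.json",  # placeholder path
        seed=0,
    )
    logger.info("Feature importance map: {}".format(feature_importance_map))
    return env, trainer, dataset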
def run_gym(
    params: OpenAiGymParameters,
    offline_train,
    score_bar,
    seed=None,
    save_timesteps_to_dataset=None,
    start_saving_from_score=None,
    path_to_pickled_transitions=None,
    warm_trainer=None,
    reward_shape_func=None,
):
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)

    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train, rl_parameters, params
    )

    env = OpenAIGymEnvironment(
        env_type,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
        seed,
    )
    replay_buffer = create_replay_buffer(
        env, params, model_type, offline_train, path_to_pickled_transitions
    )

    trainer = warm_trainer if warm_trainer else create_trainer(params, env)
    predictor = create_predictor(trainer, model_type, use_gpu, env.action_dim)

    return train(
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        params.run_details,
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_score=start_saving_from_score,
        reward_shape_func=reward_shape_func,
    )
def multi_step_sample_generator(
    gym_env: OpenAIGymEnvironment,
    num_transitions: int,
    max_steps: Optional[int],
    multi_steps: int,
    include_shorter_samples_at_start: bool,
    include_shorter_samples_at_end: bool,
):
    """
    Convert the gym env multi-step sample format to the MDN-RNN multi-step
    sample format.

    :param gym_env: the environment used to generate multi-step samples
    :param num_transitions: number of samples to return
    :param max_steps: an episode terminates once its horizon exceeds max_steps
    :param multi_steps: number of steps of states and actions per sample
    :param include_shorter_samples_at_start: whether to keep shorter samples
        generated at the beginning of an episode
    :param include_shorter_samples_at_end: whether to keep shorter samples
        generated at the end of an episode
    """
    samples = gym_env.generate_random_samples(
        num_transitions=num_transitions,
        use_continuous_action=True,
        max_step=max_steps,
        multi_steps=multi_steps,
        include_shorter_samples_at_start=include_shorter_samples_at_start,
        include_shorter_samples_at_end=include_shorter_samples_at_end,
    )

    for j in range(num_transitions):
        sample_steps = len(samples.terminals[j])  # type: ignore
        state = dict_to_np(samples.states[j], np_size=gym_env.state_dim, key_offset=0)
        action = dict_to_np(
            samples.actions[j], np_size=gym_env.action_dim, key_offset=gym_env.state_dim
        )
        next_actions = np.float32(  # type: ignore
            [
                dict_to_np(
                    samples.next_actions[j][k],
                    np_size=gym_env.action_dim,
                    key_offset=gym_env.state_dim,
                )
                for k in range(sample_steps)
            ]
        )
        next_states = np.float32(  # type: ignore
            [
                dict_to_np(
                    samples.next_states[j][k], np_size=gym_env.state_dim, key_offset=0
                )
                for k in range(sample_steps)
            ]
        )
        rewards = np.float32(samples.rewards[j])  # type: ignore
        terminals = np.float32(samples.terminals[j])  # type: ignore
        not_terminals = np.logical_not(terminals)
        ordered_states = np.vstack((state, next_states))
        ordered_actions = np.vstack((action, next_actions))
        mdnrnn_states = ordered_states[:-1]
        mdnrnn_actions = ordered_actions[:-1]
        mdnrnn_next_states = ordered_states[-multi_steps:]
        mdnrnn_next_actions = ordered_actions[-multi_steps:]

        # Pad with zeros so that all samples have the same number of steps.
        # The general rule is to pad zeros at the end of a sequence. In
        # addition, if the sequence has only one step (i.e., the first state
        # of an episode), pad one zero row in front of the sequence so that
        # embeddings are still generated properly for one-step samples.
        num_padded_top_rows = 1 if multi_steps > 1 and sample_steps == 1 else 0
        num_padded_bottom_rows = multi_steps - sample_steps - num_padded_top_rows
        sample_steps_next = len(mdnrnn_next_states)
        num_padded_top_rows_next = 0
        num_padded_bottom_rows_next = multi_steps - sample_steps_next

        yield (
            np.pad(
                mdnrnn_states,
                ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_actions,
                ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                rewards,
                (num_padded_top_rows, num_padded_bottom_rows),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_next_states,
                ((num_padded_top_rows_next, num_padded_bottom_rows_next), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_next_actions,
                ((num_padded_top_rows_next, num_padded_bottom_rows_next), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                not_terminals,
                (num_padded_top_rows, num_padded_bottom_rows),
                "constant",
                constant_values=0.0,
            ),
            sample_steps,
            sample_steps_next,
        )
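# Illustrative only: a sketch of how the generator above could be consumed to
# build fixed-shape numpy batches for MDN-RNN training. The helper name and the
# include_shorter_samples_* settings are assumptions; the tuple layout matches
# what multi_step_sample_generator yields.
def example_collect_mdnrnn_batch(gym_env, num_transitions, multi_steps, max_steps=None):
    states, actions, rewards, next_states, next_actions, not_terminals = (
        [], [], [], [], [], []
    )
    for sample in multi_step_sample_generator(
        gym_env=gym_env,
        num_transitions=num_transitions,
        max_steps=max_steps,
        multi_steps=multi_steps,
        include_shorter_samples_at_start=False,
        include_shorter_samples_at_end=True,
    ):
        s, a, r, ns, na, nt, _, _ = sample
        states.append(s)
        actions.append(a)
        rewards.append(r)
        next_states.append(ns)
        next_actions.append(na)
        not_terminals.append(nt)
    # Every element is zero-padded to multi_steps rows, so stacking yields
    # (num_transitions, multi_steps, dim) arrays for states/actions and
    # (num_transitions, multi_steps) arrays for rewards and the terminal mask.
    return tuple(
        np.stack(x)
        for x in (states, actions, rewards, next_states, next_actions, not_terminals)
    )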
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "Replay buffer is too small to fill one mini-batch"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        )
    )

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions, dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info(
            "GBDT train accuracy {}% || test accuracy {}%".format(train_score, test_score)
        )
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(trainer.minibatch_size, model_type)
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor(
                    [stat.td_loss for stat in trainer.loss_reporter.incoming_stats]
                )
            )
        )
        trainer.loss_reporter.flush()
        logger.info(
            "Average TD loss: {} in epoch {}".format(batch_td_loss, i_epoch + 1)
        )

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps
        )
        avg_reward_history.append(avg_rewards)
        # For offline training, use the epoch number as the timestep history,
        # since epochs are counted over a fixed batch of data.
        epoch_history.append(i_epoch)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes, i_epoch)
        )
        if score_bar is not None and avg_rewards > score_bar:
            logger.info(
                "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
            )
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, epoch_history, trainer, predictor, gym_env
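# Illustrative only: the dictionary shape train_gym_offline_rl() above expects
# in bcq_imitator_hyper_params when the trainer has BCQ enabled. The keys come
# from the function body; the values here are placeholders, not recommended
# settings.
example_bcq_imitator_hyper_params = {
    "gbdt_trees": 100,  # n_estimators for the GradientBoostingClassifier imitator
    "max_depth": 3,  # max_depth for the GradientBoostingClassifier imitator
}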