def run_experiment(env_str, trials=10, max_steps=2000, render=False):
    successes = []
    for trial in range(trials):
        print("--------------------------")
        env = gym.make(env_str)
        env._max_episode_steps = max_steps
        env.seed(int(time.time()))   # seed environment
        prng.seed(int(time.time()))  # seed action space
        reward, steps, success = control_lqr_finite_differences(
            env, max_steps, render)
        print("Reward = {}".format(reward))
        successes.append(success)
        env.close()  # Make sure rendering window is shut.
        time.sleep(1.0)
    pct_success = successes.count(True) / float(len(successes))
    pct_failure = successes.count(False) / float(len(successes))
    print("********************")
    print(successes)
    print("% success: " + str(pct_success))
    print("% failure: " + str(pct_failure))
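A minimal usage sketch for the driver above (assumes `gym`, `time`, and `prng` are imported as in the snippet and that `control_lqr_finite_differences` is defined in the same module; the environment id is illustrative):

if __name__ == "__main__":
    # Ten trials of the finite-differences LQR controller on CartPole;
    # rendering is off so the run can go headless.
    run_experiment("CartPole-v1", trials=10, max_steps=2000, render=False)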
def run_cartpole_experiment(M=20, k=100, max_steps=2000, trials=100,
                            render=False):
    exp_steps = []
    step_threshold = 100
    for trial in range(trials):
        env = gym.make("CartPole-v1")
        env._max_episode_steps = max_steps
        env.seed(int(time.time()))   # seed environment
        prng.seed(int(time.time()))  # seed action space
        reward, steps, success = mpc.control_random_mpc_cartpole(
            env, M, k, render)
        exp_steps.append(steps)
        env.close()  # Make sure rendering window is shut.
        time.sleep(1.0)
    successes = sum(s >= step_threshold for s in exp_steps)
    failures = len(exp_steps) - successes
    pct_success = successes / float(trials)
    pct_failure = failures / float(trials)
    print("********************")
    print(exp_steps)
    print("% success: " + str(pct_success))
    print("% failure: " + str(pct_failure))
def run_pendulum_experiment(M=10, k=2000, max_steps=2000, trials=100,
                            render=False):
    rewards = []
    for trial in range(trials):
        env = gym.make("Pendulum-v0")
        env._max_episode_steps = max_steps
        env.seed(int(time.time()))   # seed environment
        prng.seed(int(time.time()))  # seed action space
        reward, steps, success = mpc.control_random_mpc_pendulum(
            env, M, k, render)
        rewards.append(reward)
        env.close()  # Make sure rendering window is shut.
        time.sleep(1.0)
    print("********************")
    print(rewards)
def __init__(self, env, discount_factor=0.99, log_dir=None, seed=None,
             gae_lambda=0, reward_len=100):
    """
    discount_factor : float
        Discount rewards by this factor
    """
    # Discount factor
    assert 0 <= discount_factor <= 1
    self._discount = discount_factor
    self.gae_lambda = gae_lambda
    self.log_dir = log_dir
    self.reward_len = reward_len
    self.set_logger(log_dir, reward_len)
    if seed is not None:
        self.logger.logger.info('Seed: {}'.format(seed))
        tf_utils.set_global_seeds(seed)
        env.seed(seed)
        prng.seed(seed)
    # Any TensorFlow models should be cached and serialized separately.
    self.tf_object_attributes = set()
    self.unserializables = set(['logger', 'replay_buffer'])
    self._env = env
    self._env_id = '{}_gym{}'.format(env.spec.id, gym.__version__)
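For reference, `tf_utils.set_global_seeds` is assumed here to follow the common baselines-style pattern; a sketch of such a helper (an assumption, not necessarily the version this repo ships) seeds every RNG the agent touches:

import random

import numpy as np


def set_global_seeds_sketch(seed):
    # Sketch of a baselines-style helper (assumption: the real tf_utils
    # version may differ). Seeds TF (if present), NumPy, and Python's random.
    try:
        import tensorflow as tf
        tf.set_random_seed(seed)  # TF1-style API
    except ImportError:
        pass
    np.random.seed(seed)
    random.seed(seed)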
def run_mountaincar_discrete_experiment(M=20, k=2000, max_steps=2000,
                                        trials=100, render=False):
    successes = []
    for trial in range(trials):
        env = gym.make("MountainCar-v0")
        env._max_episode_steps = max_steps
        env.seed(int(time.time()))   # seed environment
        prng.seed(int(time.time()))  # seed action space
        reward, steps, success = mpc.control_random_mpc_mountaincar_discrete(
            env, M, k, render)
        successes.append(success)
        env.close()  # Make sure rendering window is shut.
        time.sleep(1.0)
    print(successes)
    pct_success = successes.count(True) / float(len(successes))
    pct_failure = successes.count(False) / float(len(successes))
    print("********************")
    print("% success: " + str(pct_success))
    print("% failure: " + str(pct_failure))
def basic_segments_from_rand_rollout(
        env_id, make_env, n_desired_segments, clip_length_in_seconds,
        # These are only for use with multiprocessing
        seed=0, _verbose=True, _multiplier=1):
    """Generate a list of path segments by doing random rollouts.

    No multiprocessing.
    """
    segments = []
    env = make_env(env_id)
    env.seed(seed)
    space_prng.seed(seed)
    segment_length = int(clip_length_in_seconds * env.fps)
    while len(segments) < n_desired_segments:
        path = do_rollout(env, random_action)
        # Calculate the number of segments to sample from the path
        # such that the probability of sampling the same part twice
        # is fairly low.
        segments_for_this_path = max(
            1, int(0.25 * len(path["obs"]) / segment_length))
        for _ in range(segments_for_this_path):
            segment = sample_segment_from_path(path, segment_length)
            if segment:
                segments.append(segment)
            if _verbose and len(segments) % 10 == 0 and len(segments) > 0:
                print("Collected %s/%s segments" % (
                    len(segments) * _multiplier,
                    n_desired_segments * _multiplier))
    if _verbose:
        print("Successfully collected %s segments" % (
            len(segments) * _multiplier))
    return segments
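A hedged usage sketch, assuming `make_env` returns a wrapped env exposing an `fps` attribute and that `do_rollout`, `random_action`, and `sample_segment_from_path` are defined as in this module (the env id is illustrative):

segments = basic_segments_from_rand_rollout(
    "Hopper-v1", make_env, n_desired_segments=50,
    clip_length_in_seconds=1.5, seed=0)
print("got %d segments" % len(segments))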
def __init__(self, env, gamma, lr, obs_dim, action_dim):
    super(AntEntropyPolicy, self).__init__()
    self.affine1 = nn.Linear(obs_dim, 128)
    self.middle = nn.Linear(128, 128)
    self.mu = nn.Linear(128, action_dim)
    self.sigma = nn.Linear(128, action_dim)

    torch.nn.init.xavier_uniform_(self.affine1.weight)
    torch.nn.init.xavier_uniform_(self.middle.weight)
    torch.nn.init.xavier_uniform_(self.mu.weight)
    torch.nn.init.xavier_uniform_(self.sigma.weight)

    self.saved_log_probs = []
    self.rewards = []

    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.eps = np.finfo(np.float32).eps.item()

    self.env = env
    self.gamma = gamma
    self.obs_dim = obs_dim
    self.action_dim = action_dim

    self.env.env.set_state(ant_utils.qpos, ant_utils.qvel)
    self.init_state = np.array(self.env.env.state_vector())

    self.env.seed(int(time.time()))  # seed environment
    prng.seed(int(time.time()))      # seed action space
def load_frozen_lake():
    env = FL.FrozenLakeEnv()
    env.seed(0)
    prng.seed(10)
    np.random.seed(0)
    np.set_printoptions(precision=3)
    print(env.__doc__)
    env.demonstrate()
    return env
def main():
    save = False

    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make(args.env)
    env.seed(int(time.time()))   # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up logging to file.
    TIME = datetime.now().strftime('%Y_%m_%d-%H-%M')
    LOG_DIR = 'logs-' + args.env + '/'
    if not os.path.exists(LOG_DIR):
        os.mkdir(LOG_DIR)
    FILE_NAME = 'test' + TIME
    logging.basicConfig(level=logging.DEBUG,
                        format='%(message)s',
                        datefmt='%m-%d %H:%M',
                        filename=LOG_DIR + FILE_NAME + '.log',
                        filemode='w')
    logger = logging.getLogger(args.env + '-curiosity.pt')

    MODEL_DIR = 'models-' + args.env + '/models_' + TIME + '/'
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    # Save metadata from the run.
    with open(MODEL_DIR + "metadata", "w") as metadata:
        metadata.write("args: %s\n" % args)
        metadata.write("num_states: %s\n" % str(utils.num_states))
        metadata.write("state_bins: %s\n" % utils.state_bins)

    policies, running_avg_entropies, entropies, running_avg_ps, average_ps = \
        collect_entropy_policies(env, args.epochs, args.T, MODEL_DIR, logger)
    plotting.generate_figures(args.env, MODEL_DIR, running_avg_entropies,
                              entropies, running_avg_ps, average_ps)
    exploration_policy = average_policies(env, policies)

    if (args.collect_video):
        MODEL_DIR = ''
    # Execute the averaged policy to get its state distribution; the
    # video-saving variant is kept below for reference.
    average_p = exploration_policy.execute(args.T)
    # average_p = exploration_policy.execute(args.T, render=True,
    #     save_video_dir=MODEL_DIR + 'videos/epoch_' + str(args.epochs) + '/')
    overall_avg_ent = scipy.stats.entropy(average_p.flatten())
    # average_p = curiosity.execute_average_policy(env, policies, args.T, render=True)

    log_iteration('average', logger, average_p, [])

    print('*************')
    print(np.reshape(average_p, utils.space_dim))
    print("overall_avg_ent = %f" % overall_avg_ent)

    env.close()
    print("DONE")
def __init__(self): self.env = gym.make("Pendulum-v0") self.env.seed(int(time.time())) # seed environment prng.seed(int(time.time())) # seed action space self.initial_state = self.env.reset() self.state_size = len(self.env.observation_space.sample()) self.action_size = 1 self.x_goal = np.array([np.sin(0), np.cos(0), 0])
def __init__(self): self.env = gym.make("MountainCarContinuous-v0") self.env.seed(int(time.time())) # seed environment prng.seed(int(time.time())) # seed action space self.initial_state = self.env.reset() print("initial_state: " + str(self.initial_state)) self.state_size = len(self.env.observation_space.sample()) self.action_size = 1 self.x_goal = np.array([0.50, 1]) self.cost = 0
def main():
    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make("HalfCheetah-v2")
    env.seed(int(time.time()))   # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up experiment variables.
    T = 10000
    avg_runs = 10

    policies = load_from_dir(args.models_dir)

    for t in range(1, len(policies)):
        avg_state_dict = collect.average_policies(policies[:t])
        exploration_policy = CheetahEntropyPolicy(env, args.gamma)
        exploration_policy.load_state_dict(avg_state_dict)

        average_p = exploration_policy.execute(T)
        for i in range(avg_runs):
            average_p += exploration_policy.execute(T)
        # The initial execute() plus avg_runs more gives avg_runs + 1
        # rollouts in total.
        average_p /= float(avg_runs + 1)
        ent_average_p = scipy.stats.entropy(average_p.flatten())

        print('---------------------')
        print("Average policies[:%d]" % t)
        # print(average_p)
        print(ent_average_p)

    # Obtain average policy.
    average_policy_state_dict = collect.average_policies(policies)
    exploration_policy = CheetahEntropyPolicy(env, args.gamma)
    exploration_policy.load_state_dict(average_policy_state_dict)
    average_p = exploration_policy.execute(T)

    print('*************')
    print(np.reshape(average_p, utils.space_dim))

    # Now, learn the actual reward structure based on environment rewards.
    # actual_policy = ExplorePolicy(env, utils.obs_dim, utils.action_dim,
    #                               exploration_policy, args.lr, args.gamma,
    #                               args.eps)
    # actual_policy.learn_policy(args.episodes, args.train_steps)
    # actual_policy.execute(T, render=True)
    # actual_policy.save()

    env.close()
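`collect.average_policies` is used here without its definition; one common way to average PyTorch policies is an element-wise mean over matching `state_dict` tensors. A minimal sketch under that assumption (the actual implementation may differ):

import copy

import torch


def average_state_dicts_sketch(policies):
    # Element-wise mean of matching parameter tensors across policies.
    avg = copy.deepcopy(policies[0].state_dict())
    for key in avg:
        stacked = torch.stack([p.state_dict()[key].float() for p in policies])
        avg[key] = stacked.mean(dim=0)
    return avg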
def __init__(self, env, gamma, lr, obs_dim, action_dim):
    super(AntActorCritic, self).__init__()
    self.linear1 = nn.Linear(obs_dim, 200)
    self.lstm = nn.LSTMCell(200, 128)

    # Actor
    self.mu_linear = nn.Linear(128, action_dim)
    self.sigma_sq_linear = nn.Linear(128, action_dim)

    # Critic
    self.value_linear = nn.Linear(128, 1)

    # Initialize weights.
    self.apply(weights_init)
    self.mu_linear.weight.data = normalized_columns_initializer(
        self.mu_linear.weight.data, 0.01)
    self.sigma_sq_linear.weight.data = normalized_columns_initializer(
        self.sigma_sq_linear.weight.data, 0.01)
    self.mu_linear.bias.data.fill_(0)
    self.sigma_sq_linear.bias.data.fill_(0)
    self.value_linear.weight.data = normalized_columns_initializer(
        self.value_linear.weight.data, 1.0)
    self.value_linear.bias.data.fill_(0)
    self.lstm.bias_ih.data.fill_(0)
    self.lstm.bias_hh.data.fill_(0)

    self.train()

    self.saved_log_probs = []
    self.rewards = []
    self.values = []
    self.entropies = []

    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.eps = np.finfo(np.float32).eps.item()

    self.env = env
    self.gamma = gamma
    self.tau = 1.00
    self.obs_dim = obs_dim
    self.action_dim = action_dim

    self.env.env.set_state(ant_utils.qpos, ant_utils.qvel)
    self.init_state = np.array(self.env.env.state_vector())

    self.env.seed(int(time.time()))  # seed environment
    prng.seed(int(time.time()))      # seed action space
def main():
    # Suppress scientific notation.
    np.set_printoptions(suppress=True)

    # Make environment.
    env = gym.make(args.env)
    env.seed(int(time.time()))   # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up experiment variables.
    T = 1000
    avg_runs = 10

    policies = load_from_dir(args.models_dir)

    times = []
    entropies = []
    x_dist_times = []
    x_distributions = []
    v_dist_times = []
    v_distributions = []

    for t in range(1, len(policies)):
        average_p, avg_entropy = average_p_and_entropy(policies[:t], avg_runs)
        print('---------------------')
        print("Average policies[:%d]" % t)
        print(average_p)
        print(avg_entropy)

    # Obtain global average policy.
    exploration_policy = collect.average_policies(env, policies)
    average_p = exploration_policy.execute(T)

    print('*************')
    print(average_p)

    # actual_policy = ExplorePolicy(env, obs_dim, action_dim,
    #                               exploration_policy, args.lr, args.gamma)
    # actual_policy.learn_policy(args.episodes, args.train_steps)
    # actual_policy.execute(T, render=True)
    # actual_policy.save()

    env.close()
def main():
    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make(args.env)
    # TODO: limit acceleration (maybe also speed?) for Pendulum.
    if args.env == "Pendulum-v0":
        env.env.max_speed = 8
        env.env.max_torque = 1
    env.seed(int(time.time()))   # seed environment
    prng.seed(int(time.time()))  # seed action space

    TIME = datetime.now().strftime('%Y_%m_%d-%H-%M')
    MODEL_DIR = 'models-' + args.env + '/models_' + TIME + '/'

    if args.save_models:
        if not os.path.exists(MODEL_DIR):
            os.makedirs(MODEL_DIR)
        # Save metadata from the run.
        with open(MODEL_DIR + "metadata", "w") as metadata:
            metadata.write("args: %s\n" % args)
            metadata.write("num_states: %s\n" % str(utils.num_states))
            metadata.write("state_bins: %s\n" % utils.state_bins)

    plotting.FIG_DIR = 'figs/' + args.env + '/'
    plotting.model_time = 'models_' + TIME + '/'
    if not os.path.exists(plotting.FIG_DIR + plotting.model_time):
        os.makedirs(plotting.FIG_DIR + plotting.model_time)

    policies = collect_entropy_policies(env, args.epochs, args.T, MODEL_DIR)
    exploration_policy = average_policies(env, policies)

    # Final policy:
    # average_p, _, _ = curiosity.execute_average_policy(env, policies, args.T)
    # overall_avg_ent = scipy.stats.entropy(average_p.flatten())
    # print('*************')
    # print(np.reshape(average_p, utils.space_dim))
    # print("overall_avg_ent = %f" % overall_avg_ent)

    env.close()
    print("DONE")
def run_environment_episode(env, pi, seed, model_file, max_timesteps, render,
                            stochastic):
    number_of_timestep = 0
    done = False

    # Load model.
    my_tf_util.load_state(model_file)

    # Set seeds.
    set_global_seeds(seed)
    env.seed(seed)
    import gym.spaces.prng as prng
    prng.seed(seed)

    obs = env.reset()

    cum_reward = []
    observations = []
    distance = []
    cum_rew_p = []

    # max_timesteps is set to 1000.
    while (not done) and number_of_timestep < max_timesteps:
        action, _ = pi.act(stochastic, obs)
        obs, reward, done, info = env.step(action)

        observations.append(obs)
        cum_reward.append(reward)
        distance.append(info["distance_delta"])
        cum_rew_p.append(info["rew_p"])

        # Render.
        if render:
            env.render()

        number_of_timestep += 1

    return observations, cum_reward, distance, cum_rew_p
def example(env):
    """Show an example of gym

    Parameters
    ----------
    env: gym.core.Environment
        Environment to play on. Must have nS, nA, and P as attributes.
    """
    # Seed the env and the action-space sampler so the printed rollout
    # is reproducible.
    env.seed(0)
    from gym.spaces import prng
    prng.seed(10)

    # Generate the episode.
    ob = env.reset()
    for t in range(100):
        env.render()
        a = env.action_space.sample()
        ob, rew, done, _ = env.step(a)
        if done:
            break
    assert done
    env.render()
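For instance, on a discrete gridworld that exposes `nS`, `nA`, and `P` (FrozenLake does, once the `TimeLimit` wrapper is unwrapped):

import gym

env = gym.make("FrozenLake-v0").env  # unwrap TimeLimit to reach nS/nA/P
example(env)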
def main():
    learning_rate = 0.1
    epochs = 20
    gamma = 1
    horizon = 200
    traj_len = 15

    env = FrozenLakeEnvMultigoal(goal=2)
    env.seed(0)
    prng.seed(10)

    mdp1 = MDP(FrozenLakeEnvMultigoal(is_slippery=False, goal=1))
    r1 = np.zeros(mdp1.nS)
    r1[-1] = 1
    print('Reward used to generate expert trajectories: ', r1)

    policy1 = compute_policy(mdp1, gamma, r1, threshold=1e-8, horizon=horizon)
    trajectories1 = generate_trajectories(mdp1, policy1, traj_len, 200)
    print('Generated ', trajectories1.shape[0], ' traj of length ', traj_len)

    sa_visit_count, _ = compute_s_a_visitations(mdp1, gamma, trajectories1)
    print('Log likelihood of all traj under the policy generated',
          'from the original reward: ',
          np.sum(sa_visit_count * np.log(policy1)),
          'average per traj step: ',
          np.sum(sa_visit_count * np.log(policy1)) /
          (trajectories1.shape[0] * trajectories1.shape[1]), '\n')

    r = np.random.rand(mdp1.nS)
    print('Randomly initialized reward: ', r)

    r = max_causal_ent_irl(mdp1, gamma, trajectories1, epochs, learning_rate,
                           r=r, horizon=horizon)
    print('Final reward: ', r)
def main():
    # Suppress scientific notation.
    np.set_printoptions(suppress=True, edgeitems=100)

    # Make environment.
    env = gym.make(args.env)
    env.seed(int(time.time()))   # seed environment
    prng.seed(int(time.time()))  # seed action space

    # Set up saving models.
    # TIME = datetime.now().strftime('%Y_%m_%d-%H-%M')
    # MODEL_DIR = 'models-' + args.env + '/models_' + TIME + '/'
    # if not os.path.exists(MODEL_DIR):
    #     os.makedirs(MODEL_DIR)
    # # Save metadata from the run.
    # with open(MODEL_DIR + "metadata", "w") as metadata:
    #     metadata.write("args: %s\n" % args)
    #     metadata.write("num_states: %s\n" % str(ant_utils.num_states))
    #     metadata.write("state_bins: %s\n" % ant_utils.state_bins)

    policies = collect_entropy_policies(env, args.epochs, args.T)
    exploration_policy = average_policies(env, policies)

    if (args.collect_video):
        MODEL_DIR = ''
    # Execute the averaged policy to get its state distribution; the
    # video-saving variant is kept below for reference.
    average_p = exploration_policy.execute(args.T)
    # average_p = exploration_policy.execute(args.T, render=True,
    #     save_video_dir=MODEL_DIR + 'videos/epoch_' + str(args.epochs) + '/')
    overall_avg_ent = scipy.stats.entropy(average_p.flatten())
    # average_p = curiosity.execute_average_policy(env, policies, args.T, render=True)

    print('*************')
    # print(np.reshape(average_p, ant_utils.space_dim))
    print("overall_avg_ent = %f" % overall_avg_ent)

    env.close()
    print("DONE")
            ['right', 'A', 'B'],
        ]
        self.children = []

    def AddChild(self, randomAction):
        self.children.append(randomAction)
        self.unvisitedActions.remove(randomAction)

    def selectRandomAction(self, randomValue):
        # Not actually random right now: pick by index from the
        # unvisited actions.
        return self.unvisitedActions[randomValue]


state = env.reset()

# Use the same seed to see the same outcomes.
prng.seed(1337)

# FIRST STEP OCCURRED REGARDLESS
state, reward, done, info = env.step(env.action_space.sample())
# env.render()

# Save the initial life number.
lifeNum = info["life"]  # always supposed to be 3

# Check if the level has been completed.
while not info['flag_get']:
    state, reward, done, info = env.step(env.action_space.sample())
    # Render the action/frame that occurred.
    # env.render()
    print(info["life"])
env.reset()  # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

state = env.encode(3, 1, 2, 0)  # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

# Seed the action-space sampler so the rollout below is reproducible.
prng.seed(1337)

env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = []  # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
print(env.__doc__)
print("")

#################################
# Some basic imports and setup
# Let's look at what a random episode looks like.

import numpy as np, numpy.random as nr, gym
import matplotlib.pyplot as plt
# %matplotlib inline
np.set_printoptions(precision=3)

# Seed RNGs so you get the same printouts as me
env.seed(0)
from gym.spaces import prng
prng.seed(10)

# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render()

#################################
# Create MDP for our env
# We extract the relevant information from the gym Env into the MDP class below.
# The `env` object won't be used any further, we'll just use the `mdp` object.
def _seed(self, seed=None):
    super(KukaPoseEnv, self)._seed(seed)
    prng.seed(seed)
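In the old gym API this `_seed` hook is what `env.seed(...)` dispatches to, so seeding the env also reseeds the shared action-space sampler in one call; a usage sketch (assuming `KukaPoseEnv` can be constructed with defaults):

env = KukaPoseEnv()  # assumption: default constructor
env.seed(42)         # old gym forwards this to _seed(42), which also seeds prng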
env = gym.make('FrozenLake-v0')
env = env.env
print(env.__doc__)
print("")

#################################
# Some basic imports and setup
# Let's look at what a random episode looks like.

import numpy as np, numpy.random as nr, gym
import matplotlib.pyplot as plt
# %matplotlib inline
np.set_printoptions(precision=3)

# Seed RNGs so you get the same printouts as me
env.seed(0)
from gym.spaces import prng
prng.seed(10)

# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render()

#################################
# Create MDP for our env
# We extract the relevant information from the gym Env into the MDP class below.
# The `env` object won't be used any further, we'll just use the `mdp` object.
def run(seed_num, seed_path, run_path, run_type, train=True):
    env = make_atari(ENV_NAME)
    env = wrap_deepmind(env, frame_stack=True, scale=False)
    env.seed(seed_num)
    seed(seed_num)

    dqn_agent = DDQN(env, run_path=run_path)
    dqn_agent.train_target_network()

    obs = env.reset().__array__(dtype=np.uint8)

    if train:
        state = load_state(run_path)
        if state is not None:
            ep = state['ep']
            t_start = state['t']
            ep_steps = state['ep_steps']
            # ep_score = state['score']
            std_dev_score = state['std_dev_score']
            dqn_agent.epsilon = state['epsilon']
            avg_score = state['avg_score']
            best_avg_score = state['best_avg_score']
            replay_fill_size = state['replay_fill_size']
        else:
            ep = 0
            t_start = 0
            ep_steps = 0
            replay_fill_size = 0
            # ep_score = 0
            std_dev_score = 0
            avg_score = -21
            best_avg_score = -21

        plt_data = load_plot_data(run_path)
        if plt_data is not None:
            avg_score_vals = plt_data['avg_score_vals']
            epsilon_vals = plt_data['epsilon_vals']
            best_avg_score_vals = plt_data['best_avg_score_vals']
            std_dev_score_vals = plt_data['std_dev_score_vals']
            replay_fill_size_vals = plt_data['replay_fill_size_vals']
            score_window = plt_data['score_window']
        else:
            avg_score_vals = deque()
            epsilon_vals = deque()
            # epsilon_vals.append(dqn_agent.epsilon)
            best_avg_score_vals = deque()
            std_dev_score_vals = deque()
            replay_fill_size_vals = deque()
            score_window = deque(maxlen=args.score_window_size)
            avg_score_vals.append(avg_score)
            epsilon_vals.append(dqn_agent.epsilon)
            best_avg_score_vals.append(best_avg_score)
            std_dev_score_vals.append(std_dev_score)
            replay_fill_size_vals.append(replay_fill_size)

        ep_score = 0
        max_score = -21
        min_score = 21

        print('avg_score_vals: {}'.format(avg_score_vals))
        print('avg_score_vals_size: {}'.format(len(avg_score_vals)))
        print('std_dev_score_vals: {}'.format(std_dev_score_vals))
        print('std_dev_score_vals size: {}'.format(len(std_dev_score_vals)))
        print('best_avg_score_vals: {}'.format(best_avg_score_vals))
        print('best_avg_score_vals size: {}'.format(len(best_avg_score_vals)))
        print('replay_fill_size_vals: {}'.format(replay_fill_size_vals))
        print('replay_fill_size_vals size: {}'.format(
            len(replay_fill_size_vals)))
        print('epsilon_vals: {}'.format(epsilon_vals))
        print('epsilon_vals size: {}'.format(len(epsilon_vals)))
        print('score_window: {}'.format(score_window))
        print('score_window size: {}'.format(len(score_window)))

        for t in range(t_start, TIMESTEPS):
            action = dqn_agent.choose_action(obs)
            dqn_agent.update_epsilon(ep, rt=run_type)
            new_obs, rew, done, _ = env.step(action)
            new_obs = new_obs.__array__(dtype=np.uint8)
            dqn_agent.remember(obs, action, rew, new_obs, done)
            obs = new_obs
            ep_score += rew
            ep_steps += 1

            if done:
                obs = env.reset().__array__(dtype=np.uint8)
                score_window.append(ep_score)
                ep += 1
                print('Episode {} | Timestep {} -> Score: {}'.format(
                    ep, t, ep_score))
                avg_score = round(np.mean(score_window), 1)
                if (avg_score > best_avg_score
                        and t > dqn_agent.learn_start
                        and ep % args.save_frequency == 0):
                    best_avg_score = avg_score
                    dqn_agent.save_mdl()
                std_dev_score = round(np.std(score_window), 1)
                avg_score_vals.append(avg_score)
                epsilon_vals.append(dqn_agent.epsilon)
                best_avg_score_vals.append(best_avg_score)
                std_dev_score_vals.append(std_dev_score)
                replay_fill_size_vals.append(
                    dqn_agent.replay_buffer.meta_data['fill_size'])
                print('Size of avg_score_vals buffer: {}'.format(
                    len(avg_score_vals)))

                if ep % args.log_frequency == 0:
                    print('Avg score: {}'.format(avg_score))
                    print('Time spent exploring: {} %'.format(round(
                        100 * dqn_agent.exploration.low_damp_value(
                            ep,
                            wave_offset=args.wave_offset,
                            anneal_factor=args.anneal_factor * args.ep_lim,
                            damp_freq_factor=args.damp_freq_factor * args.ep_lim),
                        2)))
                    print('Std dev of score: {}'.format(std_dev_score))
                    # print('Max score: {}'.format(max_score))
                    # print('Min score: {}'.format(min_score))
                    print('ReplayBuffer fill_size: {}'.format(
                        dqn_agent.replay_buffer.meta_data['fill_size']))
                    print('Score window contents: {}'.format(
                        np.array(score_window)))

                if ep > 0 and ep % args.plot_frequency == 0:
                    x_vals = np.arange(ep + 1)
                    draw_plot(avg_score_plt, ax1, x_vals, avg_score_vals,
                              'Episodes', 'Avg Score',
                              plot_name='Episodes vs Avg Score',
                              plot_path=seed_path, rt=run_type)
                    draw_plot(epsilon_vals_plt, ax2, x_vals, epsilon_vals,
                              'Episodes', 'Epsilon',
                              plot_name='Episodes vs Epsilon',
                              plot_path=seed_path, rt=run_type)
                    draw_plot(std_dev_plt, ax3, x_vals, std_dev_score_vals,
                              'Episodes', 'Std Dev of Scores',
                              plot_name='Episodes vs Std dev of score',
                              plot_path=seed_path, rt=run_type)
                    draw_plot(replay_fill_plt, ax4, x_vals,
                              replay_fill_size_vals, 'Episodes',
                              'Replay buffer Fill Size',
                              plot_name='Episodes vs Replay Buffer Fill size',
                              plot_path=seed_path, rt=run_type)
                    draw_plot(best_avg_score_plt, ax5, x_vals,
                              best_avg_score_vals, 'Episodes',
                              'Best Avg Score',
                              plot_name='Episodes vs Best Avg Score',
                              plot_path=seed_path, rt=run_type)

                if ep % args.save_frequency == 0:
                    plot_data = {
                        'avg_score_vals': avg_score_vals,
                        'epsilon_vals': epsilon_vals,
                        'best_avg_score_vals': best_avg_score_vals,
                        'std_dev_score_vals': std_dev_score_vals,
                        'replay_fill_size_vals': replay_fill_size_vals,
                        'score_window': score_window,
                    }
                    save_plot_data(plt_data=plot_data, run_path=run_path)
                    state_data = {
                        'ep': ep,
                        't': t,
                        'ep_steps': ep_steps,
                        'score': ep_score,
                        'avg_score': avg_score,
                        'std_dev_score': std_dev_score,
                        'replay_fill_size': replay_fill_size,
                        'best_avg_score': best_avg_score,
                        'epsilon': dqn_agent.epsilon
                    }
                    save_state(st_data=state_data, run_path=run_path)
                    dqn_agent.save()

                ep_score = 0
                ep_steps = 0

                if ep + 1 == EPISODES:  # or best_avg_score >= args.target_score:
                    print('---------------------------just before finish-----------------------------------------')
                    x_vals = np.arange(ep + 1)
                    draw_plot(avg_score_plt, ax1, x_vals, avg_score_vals,
                              'Episodes', 'Avg Score',
                              plot_name='Episodes vs Avg Score',
                              plot_path=seed_path, rt=run_type, legend=True)
                    draw_plot(epsilon_vals_plt, ax2, x_vals, epsilon_vals,
                              'Episodes', 'Epsilon',
                              plot_name='Episodes vs Epsilon',
                              plot_path=seed_path, rt=run_type, legend=True)
                    draw_plot(std_dev_plt, ax3, x_vals, std_dev_score_vals,
                              'Episodes', 'Std Dev of Scores',
                              plot_name='Episodes vs Std dev of score',
                              plot_path=seed_path, rt=run_type, legend=True)
                    draw_plot(replay_fill_plt, ax4, x_vals,
                              replay_fill_size_vals, 'Episodes',
                              'Replay buffer Fill Size',
                              plot_name='Episodes vs Replay Buffer Fill size',
                              plot_path=seed_path, rt=run_type, legend=True)
                    draw_plot(best_avg_score_plt, ax5, x_vals,
                              best_avg_score_vals, 'Episodes',
                              'Best Avg Score',
                              plot_name='Episodes vs Best Avg Score',
                              plot_path=seed_path, rt=run_type, legend=True)
                    break

            if t > dqn_agent.learn_start:
                if t % dqn_agent.train_freq == 0:
                    dqn_agent.replay()
                if t % dqn_agent.train_targets == 0:
                    dqn_agent.train_target_network()
def bMarioDead(currentLifeCount):
    global lifeNum
    return lifeNum != currentLifeCount


state = env.reset()

# =========== MAIN CODE =========================

# Use the same seed to see the same outcomes.
SEED = 1337
prng.seed(SEED)
random.seed(SEED)

# FIRST STEP OCCURRED REGARDLESS  -----ROOT------
# root = Node()
# # randomAction = env.action_space.sample()
# state, reward, done, info = env.step(0)
# print(info)
# # print(randomAction)
# # save the initial life number
# lifeNum = info["life"]
# currentChild = root.returnChild(0, info["x_pos"], bMarioDead(info["life"]))
# env.render()

lifeNum = 3
currentChild = Node(None, None, False, 0)
def env_thread(args, thread_num, partition=True, use_ppo2=False):
    """
    Run a session of an environment
    :param args: (ArgumentParser object)
    :param thread_num: (int) The thread ID of the environment session
    :param partition: (bool) If the output should be in multiple parts (default=True)
    :param use_ppo2: (bool) Use ppo2 to generate the dataset
    """
    env_kwargs = {
        "max_distance": args.max_distance,
        "random_target": args.random_target,
        "force_down": True,
        "is_discrete": not args.continuous_actions,
        "renders": thread_num == 0 and args.display,
        "record_data": not args.no_record_data,
        "multi_view": args.multi_view,
        "save_path": args.save_path,
        "shape_reward": args.shape_reward
    }

    if partition:
        env_kwargs["name"] = args.name + "_part-" + str(thread_num)
    else:
        env_kwargs["name"] = args.name

    env_class = registered_env[args.env][0]
    env = env_class(**env_kwargs)

    # Additional env when using a trained ppo agent to generate data
    # instead of a random agent
    train_env = env_class(**{**env_kwargs, "record_data": False, "renders": False})
    train_env = DummyVecEnv([lambda: train_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False)

    model = None
    if use_ppo2:
        model = PPO2(CnnPolicy, train_env).learn(args.ppo2_timesteps)

    frames = 0
    start_time = time.time()
    # Divide evenly, then do an extra one for only some of them
    # in order to get the right count.
    for i_episode in range(args.num_episode // args.num_cpu +
                           1 * (args.num_episode % args.num_cpu > thread_num)):
        # seed + position in this slice + size of slice
        # (with remainder if uneven partitions)
        seed = args.seed + i_episode + args.num_episode // args.num_cpu * thread_num + \
            (thread_num if thread_num <= args.num_episode % args.num_cpu
             else args.num_episode % args.num_cpu)
        env.seed(seed)
        prng.seed(seed)  # this is for the sample() function from gym.space

        obs = env.reset()
        done = False
        t = 0
        while not done:
            env.render()
            if use_ppo2:
                action, _ = model.predict([obs])
            else:
                action = [env.action_space.sample()]

            _, _, done, _ = env.step(action[0])
            frames += 1
            t += 1
            if done:
                print("Episode finished after {} timesteps".format(t + 1))

    if thread_num == 0:
        print("{:.2f} FPS".format(frames * args.num_cpu / (time.time() - start_time)))
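A quick worked check of the seed partitioning above, with hypothetical values `args.seed=0`, `num_episode=10`, `num_cpu=4`: threads 0 and 1 each run 3 episodes with seeds 0-2 and 3-5, while threads 2 and 3 run 2 episodes each with seeds 6-7 and 8-9. All 10 episodes get distinct seeds and the total episode count is exact.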
def seed(self, seed):
    prng.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
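A quick check of what this buys (a sketch; assumes an old gym release where all spaces draw from the module-level `gym.spaces.prng`):

import gym
from gym.spaces import prng

env = gym.make("CartPole-v0")
prng.seed(0)
a1 = env.action_space.sample()
prng.seed(0)
a2 = env.action_space.sample()
assert a1 == a2  # same seed, same sampled action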
def run(seed_num, train=True, run_type='base'):
    env = gym.make(ENV_NAME)  # .env
    env._max_episode_steps = args.max_timesteps
    env.seed(seed_num)
    seed(seed_num)

    dqn_agent = DDQN(env)

    trial_seed_path = os.path.join(data_path, 'Seed ' + str(seed_num))
    if not os.path.exists(trial_seed_path):
        os.makedirs(trial_seed_path)

    if train:
        max_score = -1000
        min_score = 1000
        avg_score = -1000
        best_avg_score = -1000
        epsilon_vals = deque(maxlen=EPISODES)
        avg_score_vals = deque(maxlen=EPISODES)
        best_avg_score_vals = deque(maxlen=EPISODES)
        # max_score_vals = deque(maxlen=EPISODES)
        # min_score_vals = deque(maxlen=EPISODES)
        std_dev_score_vals = deque(maxlen=EPISODES)
        replay_fill_size_vals = deque(maxlen=EPISODES)
        score_window = deque(maxlen=100)
        render = False  # defined up front so the inner loop can always read it

        for ep in range(EPISODES):
            curr_obs = env.reset()
            curr_obs = reshape_input(curr_obs)
            total_r = 0

            if ep % 10 == 0:
                render = False  # True
            epsilon_vals.append(dqn_agent.epsilon)

            while True:
                if render:
                    env.render()
                action = dqn_agent.choose_action(curr_obs)
                next_obs, reward, done, info = env.step(action)
                next_obs = reshape_input(next_obs)
                total_r += reward
                if total_r % 1000 == 0:
                    print('current reward for ep {}: reached {}'.format(
                        ep, total_r))
                dqn_agent.remember(curr_obs, action, reward, next_obs, done)
                curr_obs = next_obs
                dqn_agent.replay()
                dqn_agent.update_epsilon(ep, run_type)
                if done:
                    if render:
                        render = False
                        env.close()
                    if ep % dqn_agent.train_targets == 0:
                        dqn_agent.train_target_network()
                    break

            score_window.append(total_r)
            avg_score = np.mean(score_window)
            if avg_score > best_avg_score:
                best_avg_score = avg_score
                dqn_agent.save_mdl(trial_seed_path, run_type)
            std_dev_score = np.std(score_window)
            # max_score = max(total_r, max_score)
            # min_score = min(total_r, min_score)
            # max_score_vals.append(max_score)
            # min_score_vals.append(min_score)
            avg_score_vals.append(avg_score)
            best_avg_score_vals.append(best_avg_score)
            std_dev_score_vals.append(std_dev_score)
            replay_fill_size_vals.append(
                dqn_agent.replay_buffer.meta_data['fill_size'])

            print('Episode ', ep, ' -> Score: ', total_r)
            if ep % args.log_frequency == 0:
                print('Avg score: {}'.format(avg_score))
                print('Std dev of score: {}'.format(std_dev_score))
                print('Max score: {}'.format(max_score))
                print('Min score: {}'.format(min_score))
                print('ReplayBuffer fill_size: {}'.format(
                    dqn_agent.replay_buffer.meta_data['fill_size']))

            if ep > 0 and ep % args.plot_frequency == 0:
                if ep + 1 == EPISODES:
                    ax1.lines[-1].set_label(run_type)
                    ax2.lines[-1].set_label(run_type)
                    ax3.lines[-1].set_label(run_type)
                    ax4.lines[-1].set_label(run_type)
                    ax5.lines[-1].set_label(run_type)
                    ax1.legend()
                    ax2.legend()
                    ax3.legend()
                    ax4.legend()
                    ax5.legend()
                x_vals = np.arange(ep + 1)
                save_plot(avg_score_plt, ax1, x_vals, avg_score_vals,
                          'Episodes', 'Avg Score',
                          plot_name='Episodes vs Avg Score',
                          trial_seed_path=trial_seed_path, rt=run_type)
                save_plot(epsilon_vals_plt, ax2, x_vals, epsilon_vals,
                          'Episodes', 'Epsilon',
                          plot_name='Episodes vs Epsilon',
                          trial_seed_path=trial_seed_path, rt=run_type)
                save_plot(std_dev_plt, ax3, x_vals, std_dev_score_vals,
                          'Episodes', 'Std Dev of Scores',
                          plot_name='Episodes vs Std dev of score',
                          trial_seed_path=trial_seed_path, rt=run_type)
                save_plot(replay_fill_plt, ax4, x_vals, replay_fill_size_vals,
                          'Episodes', 'Replay buffer Fill Size',
                          plot_name='Episodes vs Replay Buffer Fill size',
                          trial_seed_path=trial_seed_path, rt=run_type)
                save_plot(best_avg_score_plt, ax5, x_vals,
                          best_avg_score_vals, 'Episodes', 'Best Avg Score',
                          plot_name='Episodes vs Best Avg Score',
                          trial_seed_path=trial_seed_path, rt=run_type)
                save_plot_data(d=avg_score_vals,
                               trial_seed_path=trial_seed_path, rt=run_type)
from rlkit.envs.gridcraft import REW_ARENA_64
from rlkit.envs.gridcraft.grid_env import GridEnv
from rlkit.envs.gridcraft.grid_spec import *
from rlkit.envs.gridcraft.mazes import MAZE_ANY_START1
import gym.spaces.prng as prng
import numpy as np

if __name__ == "__main__":
    prng.seed(2)

    maze_spec = \
        spec_from_string("SOOOO#R#OO\\" +
                         "OSOOO#2##O\\" +
                         "###OO#3O#O\\" +
                         "OOOOO#OO#O\\" +
                         "OOOOOOOOOO\\")
    # maze_spec = spec_from_sparse_locations(50, 50, {START: [(25, 25)], REWARD: [(45, 45)]})
    # maze_spec = REW_ARENA_64
    maze_spec = MAZE_ANY_START1

    env = GridEnv(maze_spec, one_hot=True, add_eyes=True, coordinate_wise=True)
    s = env.reset()
    # env.render()

    obses = []
    for t in range(10):
        a = env.action_space.sample()
        obs, r, done, infos = env.step(a, verbose=True)