model = DQN(dim, K, sizes, gamma)
tmodel = DQN(dim, K, sizes, gamma)

# Session
init = tf.global_variables_initializer()
session = tf.InteractiveSession()
session.run(init)
model.set_session(session)
tmodel.set_session(session)

# Directory to save results
if not os.path.exists('Results'):
    os.mkdir('Results')
os.chdir('Results')

env = wrappers.Monitor(env, 'Policy_Gradient_Hill_Climbing_Result', force=True)

N = 500
totalrewards = np.empty(N)
costs = np.empty(N)
for n in range(N):
    eps = 1.0 / np.sqrt(n + 1)
    totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
    totalrewards[n] = totalreward
    if n % 100 == 0:
        print("episode:", n, "total reward:", totalreward, "eps:", eps,
              "avg reward (last 100):", totalrewards[max(0, n - 100):(n + 1)].mean())

print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
def __init__(self, params):

    #############
    ## INIT
    #############

    # Get params, create logger, create TF session
    self.params = params
    self.logger = Logger(self.params['logdir'])
    self.sess = create_tf_session(self.params['use_gpu'],
                                  which_gpu=self.params['which_gpu'])

    # Set random seeds
    seed = self.params['seed']
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #############
    ## ENV
    #############

    # Make the gym environment
    self.env = gym.make(self.params['env_name'])
    if 'env_wrappers' in self.params:
        # These operations are currently only for Atari envs
        self.env = wrappers.Monitor(self.env,
                                    os.path.join(self.params['logdir'], "gym"),
                                    force=True)
        self.env = params['env_wrappers'](self.env)
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
    self.env.seed(seed)

    # Import plotting (locally if 'obstacles' env)
    if not (self.params['env_name'] == 'obstacles-cs285-v0'):
        import matplotlib
        matplotlib.use('Agg')

    # Maximum length for episodes
    self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
    global MAX_VIDEO_LEN
    MAX_VIDEO_LEN = self.params['ep_len']

    # Is this env continuous or discrete?
    discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
    # Are the observations images?
    img = len(self.env.observation_space.shape) > 2
    self.params['agent_params']['discrete'] = discrete

    # Observation and action sizes
    ob_dim = self.env.observation_space.shape if img \
        else self.env.observation_space.shape[0]
    ac_dim = self.env.action_space.n if discrete \
        else self.env.action_space.shape[0]
    self.params['agent_params']['ac_dim'] = ac_dim
    self.params['agent_params']['ob_dim'] = ob_dim

    # Simulation timestep, will be used for video saving
    if 'model' in dir(self.env):
        self.fps = 1 / self.env.model.opt.timestep
    elif 'env_wrappers' in self.params:
        self.fps = 30  # This is not actually used when using the Monitor wrapper
    elif 'video.frames_per_second' in self.env.env.metadata.keys():
        self.fps = self.env.env.metadata['video.frames_per_second']
    else:
        self.fps = 10

    #############
    ## AGENT
    #############

    agent_class = self.params['agent_class']
    self.agent = agent_class(self.sess, self.env, self.params['agent_params'])

    #############
    ## INIT VARS
    #############

    tf.global_variables_initializer().run(session=self.sess)
import gym
from gym import wrappers

env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './video')

for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        # env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break
import gym
import numpy as np
from gym import wrappers
import os

env = gym.make('CartPole-v0')

model_dir = './models/model_[50, 50, 50, 50]/1567524154.1553748'
save_dir = model_dir + '/videos'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
env = wrappers.Monitor(env, save_dir, force=True)

BUCKET_SIZE = [50, 50, 50, 50]
LIMIT_VAR = [(env.observation_space.low[0], env.observation_space.high[0]),
             (-4, 4),
             (env.observation_space.low[2], env.observation_space.high[2]),
             (-5, 5)]
WINDOW_SIZE = []
real_size = [(size + 2) if size != 1 else 1 for size in BUCKET_SIZE]

Q_values = np.load(
    './models/model_[50, 50, 50, 50]/1567524154.1553748/model_at_100000_episodes_trained.npy'
)


def chooseAction(discrete_state):
# Import our training environment
import old_way_moving_cube_env

if __name__ == '__main__':

    rospy.init_node('movingcube_gym', anonymous=True, log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('OldMovingCube-v0')
    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('moving_cube_training_pkg')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file
    Alpha = rospy.get_param("/moving_cube/alpha")
    Epsilon = rospy.get_param("/moving_cube/epsilon")
    Gamma = rospy.get_param("/moving_cube/gamma")
    epsilon_discount = rospy.get_param("/moving_cube/epsilon_discount")
    nepisodes = rospy.get_param("/moving_cube/nepisodes")
    nsteps = rospy.get_param("/moving_cube/nsteps")
    running_step = rospy.get_param("/moving_cube/running_step")
scores = {k: max(r_pos, r_neg)
          for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
# Keep only the best directions (highest scores first)
order = sorted(scores.keys(), key=lambda x: -scores[x])[:hp.nb_best_directions]
rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

# Updating our policy
policy.update(rollouts, sigma_r)

# Printing the final reward of the policy after the update
reward_evaluation = explore(env, normalizer, policy)
print('Step: ', step, 'Reward: ', reward_evaluation)


def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')

hp = Hp()
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
env = wrappers.Monitor(env, monitor_dir, force=True)
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs, nb_outputs)
normalizer = Normalizer(nb_inputs)
train(env, policy, normalizer, hp)
import gym
from time import sleep
from gym import wrappers

env = gym.make('LunarLander-v2')
env = wrappers.Monitor(env, './')
env.seed(0)

g = 1.0
delta_t = 1.0 / 50.0
action = 0
state = env.reset()
y0 = state[1]
v0 = 0
cut_off = 0.01

for t in range(3000):
    env.render()
    state, reward, done, _ = env.step(action)
    y = state[1]
    v = (y - y0) / delta_t
    if done or y < 0 or v == 0.001:
        break
    alt_burn = (y * g + 0.5 * v * v) / (13.0 / env.lander.mass * 0.5)
    v0 = v
    y0 = y
    if y < alt_burn and y > cut_off:
        action = 2
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"]["policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym_wrappers.Monitor(env=env,
                                   directory=video_dir,
                                   video_callable=lambda x: True,
                                   force=True)

    steps = 0
    episodes = 0
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and keep_going(steps, num_steps, episodes, num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        if done:
            episodes += 1
import gym
import numpy as np
import tensorflow as tf
from gym import wrappers

# Deep Q-learning algorithm
# 1. Do a feedforward pass for the current state s to get predicted Q-values
#    for all actions.
# 2. Do a feedforward pass for the next state s' and calculate the maximum over
#    all network outputs, max_a' Q(s', a').
# 3. Set the Q-value target for action a to r + gamma * max_a' Q(s', a') (use the
#    max calculated in step 2). For all other actions, set the Q-value target to
#    the same as originally returned from step 1, making the error 0 for those
#    outputs.
# 4. Update the weights using backpropagation.

# Create the FrozenLake environment
env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, '/tmp/frozenlake-qlearning', force=True)
n_obv = env.observation_space.n
n_acts = env.action_space.n

# Neural network
x = tf.placeholder(shape=[1, 16], dtype=tf.float32)
y_ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.1))
y = tf.matmul(x, W)
action = tf.argmax(y, 1)
cost = tf.reduce_sum(tf.square(y_ - y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

# TensorFlow initialization
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
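# (Not part of the original snippet.) A minimal sketch of the update loop that
# the four steps in the comments above describe, assuming the graph and session
# defined above; gamma is an assumed value and epsilon-greedy exploration is
# omitted for brevity.
gamma = 0.99

def one_hot(s):
    # FrozenLake states are integers in [0, 16); encode them as a 1x16 vector
    return np.identity(n_obv)[s:s + 1].astype(np.float32)

state = env.reset()
done = False
while not done:
    # Step 1: predicted Q-values for the current state
    q_vals, act = sess.run([y, action], feed_dict={x: one_hot(state)})
    a = int(act[0])
    next_state, reward, done, _ = env.step(a)
    # Step 2: Q-values for the next state; take the maximum
    q_next = sess.run(y, feed_dict={x: one_hot(next_state)})
    # Step 3: target equals the prediction except for the taken action
    target = q_vals.copy()
    target[0, a] = reward + gamma * np.max(q_next)
    # Step 4: update the weights by backpropagation
    sess.run(optimizer, feed_dict={x: one_hot(state), y_: target})
    state = next_state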
                  input_dims=(env.observation_space.shape),
                  n_actions=(env.action_space.n),
                  mem_size=50000,
                  eps_min=0.1,
                  batch_size=32,
                  replace=1000,
                  eps_dec=1e-5,
                  checkpoint_dir='models/',
                  algo='DQNAgent',
                  env_name='PongNoFrameskip-v4')
agent.load_models()
print(agent.q_eval)

env = wrappers.Monitor(env, "tmp/dqn-video",
                       video_callable=lambda episode_id: True,
                       force=True)
n_steps = 0
score = 0
done = False
obs = env.reset()

while not done:
    action = agent.choose_action(obs)
    resulted_obs, reward, done, info = env.step(action)
    score += reward
    obs = resulted_obs
    n_steps += 1
print(n_steps)
def wrap_monitor(env, log_dir):
    env = wrappers.Monitor(env, log_dir, video_callable=lambda x: True)
    return env
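# (Not part of the original snippet.) Hypothetical usage of wrap_monitor; the
# environment name and log directory are illustrative, not from the source.
import gym
from gym import wrappers

env = wrap_monitor(gym.make('CartPole-v0'), './logs/videos')
obs = env.reset()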
              gamma=args.gamma,
              epsilon=epsilon,
              epsilon_min=epsilon_min,
              epsilon_dec=epsilon_dec,
              memory_size=args.memory_size,
              batch_size=args.batch_size,
              replace=args.replace,
              checkpoint_dir=models_path,
              algo=agent_path,
              env_name=args.env_name)

if load_checkpoint:
    agent.load_models()

videos_path = os.path.join(videos_path, )
env = wrappers.Monitor(env, videos_path,
                       video_callable=lambda episode_id: True,
                       force=True)  # force overwrites previous video

# For saving the plot
# fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' + str(n_games) + 'games'
if load_checkpoint:
    figure_file = os.path.join(plots_path, 'plot_eval.png')
else:
    figure_file = os.path.join(plots_path, 'plot.png')

n_steps = 0
# The steps array is for plotting scores with respect to steps instead of games
# played, because games vary a lot in length while steps are uniform.
scores, eps_history, steps_array = [], [], []
start = time.time()
def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o', '--output', default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--network', choices=['deep', 'linear'], default='deep')
    parser.add_argument('--method', choices=['dqn', 'double', 'dueling'], default='dqn')
    parser.add_argument('--monitor', type=bool, default=False)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy', choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')

    args = parser.parse_args()
    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path, force=True)
    np.random.seed(args.seed)
    env.seed(args.seed)

    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32
    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4
    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000
    args.output = get_output_folder(args.output, args.env)
    args.suffix = args.method + '_' + args.network
    if args.method == 'dqn':
        args.enable_double_dqn = False
        args.enable_dueling_network = False
    elif args.method == 'double':
        args.enable_double_dqn = True
        args.enable_dueling_network = False
    elif args.method == 'dueling':
        args.enable_double_dqn = False
        args.enable_dueling_network = True
    else:
        print('Attention! Wrong method!')

    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    elif args.test_policy == 'GreedyEpsilon':
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)

    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # We create our preprocessors. The Atari preprocessor only processes the
    # current frame the agent is seeing; the sequence preprocessor constructs the
    # state by concatenating the 3 previous frames from the HistoryPreprocessor
    # with the current processed frame.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)  # constructs 84x84x4

    # We create our replay memory for saving all experience collected during
    # training, with a window length of 4.
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # We use a linear-decay greedy-epsilon policy, annealing epsilon from 1 to 0.1
    # over the first 1,000,000 iterations and then keeping it at 0.1 for the rest
    # of training.
    # We construct our agent with a discount factor of 0.99 and a batch size of 32.
    # We update the model every 4 iterations, but during the first 50000 iterations
    # we only collect data into the memory and do not update the model.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)
    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)

    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env, args.suffix)
        checkpoint_weights_filename = ('dqn_' + args.env + '_weights_' +
                                       args.suffix + '_{step}.h5f')
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval)
        ]
        callbacks += [FileLogger(log_filename, interval=100)]
        callbacks += [
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True)
        ]

        # Start training.
        # We don't apply action repetition explicitly since the game will randomly
        # skip frames itself.
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)
        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env, num_episodes=10, visualize=True, num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(args.env, args.suffix)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env, num_episodes=250, visualize=True, num_burn_in=5,
                     action_repetition=1)

    # We upload our result to OpenAI Gym.
    if args.monitor:
        env.close()
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


class MujocoProcessor(WhiteningNormalizerProcessor):
    def process_action(self, action):
        return np.clip(action, -1., 1.)


ENV_NAME = 'HalfCheetah-v2'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
env = wrappers.Monitor(env, '/tmp/{}'.format(ENV_NAME), force=True)
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(400))
actor.add(Activation('relu'))
actor.add(Dense(300))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('tanh'))
print(actor.summary())
from sac2019 import SACAgent as SAC
import numpy as np
import os
import torch
import gym
import pybullet_envs
from gym import wrappers

monitor_path = './monitor/'
if not os.path.exists(monitor_path):
    os.makedirs(monitor_path)

env_name = "AntBulletEnv-v0"
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
env = wrappers.Monitor(env, monitor_path, force=True)

start_timesteps = 10_000
eval_freq = 5_000
max_timesteps = 500_000
batch_size = 100

total_timesteps = 0
episode_reward = 0
episode_timesteps = 0
episode_num = 0
done = False
obs = env.reset()

gamma = 0.99
tau = 0.005
alpha = 0.2
    return returns


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("env_name", type=str)
    args = parser.parse_args()
    env_name = args.env_name

    env = gym.make(env_name)
    env = wrappers.Monitor(
        env,
        f"Saved_Videos/hw1/dagger/{env_name}/",
        resume=True,
        force=True,
        video_callable=lambda episode: episode % 10 == 0,
    )

    model = load_model(f"./models/hw1/{env_name}.h5")
    file = open(f"./expert_data/{env_name}.pkl", "rb")
    data = pickle.load(file)
    exp_observations, exp_actions = data["observations"], data["actions"]
    policy_fn = load_policy.load_policy(f"./experts/{env_name}.pkl")

    returns = dagger(exp_observations, exp_actions, model, max_steps=1000)
    print(f"returns = {returns}")
    print(f"mean return = {np.mean(returns)}")
    flush_millis=10000, filename_suffix="-cartpole")
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
global_step = tf.Variable(0)

net = Net(hidden_size, obs_size, n_actions)

for iter_no, batch in enumerate(iterate_batches(env, net, batch_size)):
    obs_v, acts_v, reward_b, reward_m = filter_batch(batch, percentile)
    loss_v, grads = grad(net, obs_v, acts_v)
    optimizer.apply_gradients(zip(grads, net.trainable_variables), global_step)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" %
          (iter_no, loss_v.numpy(), reward_m, reward_b))
    with writer.as_default(), tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar("loss", loss_v.numpy())
        tf.contrib.summary.scalar("reward_bound", reward_b)
        tf.contrib.summary.scalar("reward_mean", reward_m)
    if reward_m > 199:
        print("Solved!")
        break

env.close()
writer.close()

env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, '/tmp/cartpole-cross-entropy', force=True)
play(net, env)
env.close()
    return img


def make_env():
    env_spec = gym.spec('ppaquette/DoomCorridor-v0')
    env_spec.id = 'DoomBasic-v0'
    env = env_spec.make()
    e = PreprocessImage(SkipWrapper(4)(ToDiscrete("minimal")(env)),
                        width=80, height=80, grayscale=True)
    return e


env = make_env()
env = wrappers.Monitor(env, './experiment', force=True)

NOOP, SHOOT, RIGHT, LEFT, FORWARD, TURN_R, TURN_L = 0, 1, 2, 3, 4, 5, 6
VALID_ACTIONS = [0, 1, 2, 3, 4, 5, 6]


class Estimator():
    def __init__(self, scope="estimator"):
        self.scope = scope
        with tf.variable_scope(scope):
            self._build_model()

    def _build_model(self):
        self.X_pl = tf.placeholder(shape=[None, 80, 80, 4],
                                   dtype=tf.float32, name="X")
    avg_length = episode_lengths.mean()
    print("avg length:", avg_length)
    return avg_length


def random_search(env):
    episode_lengths = []
    best = 0
    params = None
    for t in range(100):
        new_params = np.random.random(4) * 2 - 1
        avg_length = play_multiple_episodes(env, 100, new_params)
        episode_lengths.append(avg_length)
        if avg_length > best:
            params = new_params
            best = avg_length
    return episode_lengths, params


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    episode_lengths, params = random_search(env)
    plt.plot(episode_lengths)
    plt.show()

    # Play a final set of episodes
    env = wrappers.Monitor(env, 'my_awesome_dir')
    print("***Final run with final weights***:",
          play_one_episode(env, params))
def evaluate(self, env, args, num_episodes, eval_count,
             max_episode_length=None, monitor=True):
    """Test your agent with a provided environment.

    You shouldn't update your network parameters here. Also if you have any
    layers that vary in behavior between train/test time (such as dropout or
    batch norm), you should set them to test.

    Basically run your policy on the environment and collect stats like
    cumulative reward, average episode length, etc.

    You can also call the render function here if you want to visually inspect
    your policy.
    """
    print("Evaluation starts.")

    plt.figure(1, figsize=(22.5, 10))

    is_training = False
    if self.load_network:
        # self.q_network.load_weights(self.load_network_path)
        # print("Load network from:", self.load_network_path)
        self.restore_model(self.load_network_path)
    if monitor:
        env = wrappers.Monitor(env, self.output_path_videos,
                               video_callable=lambda x: True, resume=True)
    state = env.reset()

    idx_episode = 1
    episode_frames = 0
    episode_reward = np.zeros(num_episodes)
    t = 0

    while idx_episode <= num_episodes:
        t += 1
        action_state = self.history_processor.process_state_for_network(
            self.atari_processor.process_state_for_network(state))
        action = self.select_action(action_state, is_training,
                                    policy_type='GreedyEpsilonPolicy')
        action_state_ori = self.history_processor.process_state_for_network_ori(
            self.atari_processor.process_state_for_network_ori(state))
        # print "state.shape", state.shape
        # print "action_state_ori.shape", action_state_ori.shape
        dice = np.random.random()
        state, reward, done, info = env.step(action)

        if dice < 1e-1 and not args.train:
            alpha_list = self.sess.run(
                self.q_network.alpha_list,
                feed_dict={self.q_network.imageIn: action_state[None, :, :, :],
                           self.q_network.batch_size: 1})
            # print alpha_list, len(alpha_list), alpha_list[0].shape  # 10 (1, 49)
            for alpha_idx in range(len(alpha_list)):
                plt.subplot(2, len(alpha_list) // 2 + 1, alpha_idx + 1)
                img = action_state_ori[:, :, :, alpha_idx]  # (210, 160, 3)
                plt.imshow(img)
                alp_curr = alpha_list[alpha_idx].reshape(7, 7)
                alp_img = skimage.transform.pyramid_expand(alp_curr,
                                                           upscale=22, sigma=20)
                plt.imshow(scipy.misc.imresize(alp_img, (img.shape[0], img.shape[1])),
                           alpha=0.7, cmap='gray')
                plt.axis('off')
            plt.subplot(2, action_state_ori.shape[3] // 2 + 1,
                        action_state_ori.shape[3] + 2)
            plt.imshow(state)
            plt.savefig('%sattention_ep%d-frame%d.png' %
                        (self.output_path_images, eval_count, episode_frames))
            print('---- Image saved at: %sattention_ep%d-frame%d.png' %
                  (self.output_path_images, eval_count, episode_frames))

        episode_frames += 1
        episode_reward[idx_episode - 1] += reward
        if episode_frames > max_episode_length:
            done = True
        if done:
            print("Eval: time %d, episode %d, length %d, reward %.0f. @eval_count %s" %
                  (t, idx_episode, episode_frames,
                   episode_reward[idx_episode - 1], eval_count))
            eval_count += 1
            save_scalar(eval_count, 'eval/eval_episode_raw_reward',
                        episode_reward[idx_episode - 1], self.writer)
            save_scalar(eval_count, 'eval/eval_episode_raw_length',
                        episode_frames, self.writer)
            sys.stdout.flush()
            state = env.reset()
            episode_frames = 0
            idx_episode += 1
            self.atari_processor.reset()
            self.history_processor.reset()

    reward_mean = np.mean(episode_reward)
    reward_std = np.std(episode_reward)
    print("Evaluation summary: num_episodes [%d], reward_mean [%.3f], reward_std [%.3f]" %
          (num_episodes, reward_mean, reward_std))
    sys.stdout.flush()

    return reward_mean, reward_std, eval_count
import gym
from gym import wrappers
import numpy as np

env = gym.make("FrozenLake-v0")
env = wrappers.Monitor(env, "./results", force=True)

Q = np.zeros([env.observation_space.n, env.action_space.n])
n_s_a = np.zeros([env.observation_space.n, env.action_space.n])
num_episodes = 100000
epsilon = 0.2
rList = []

for i in range(num_episodes):
    state = env.reset()
    rAll = 0
    done = False
    results_list = []
    result_sum = 0.0
    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        results_list.append((state, action))
        result_sum += reward
        state = new_state
        rAll += reward
    rList.append(rAll)
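    # (Not part of the original snippet.) A sketch of the every-visit Monte Carlo
    # update the accumulators above suggest: average the episode return into Q for
    # every (state, action) pair visited during the episode.
    for (s, a) in results_list:
        n_s_a[s, a] += 1.0
        Q[s, a] += (result_sum - Q[s, a]) / n_s_a[s, a]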
def start_record(self, render=False):
    if not render:
        self.env.render(close=True)
    self.env = wrappers.Monitor(self.env, self.monitor_path, force=True)
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):

    def get_checkpoint_dir(checkpoint_list, limit, prefix):
        for checkpoint in checkpoint_list:
            if ('limitation_' + str(limit) in checkpoint) and (prefix in checkpoint):
                return checkpoint
        return None

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=policy_hidden_size,
                                    num_hid_layers=2)

    data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
    dataset = load_dataset(data_path)
    checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
    log = {
        'traj_limitation': [],
        'upper_bound': [],
        'avg_ret': [],
        'avg_len': [],
        'normalized_ret': []
    }
    for i, limit in enumerate(CONFIG['traj_limitation']):
        # Do one evaluation
        upper_bound = sum(dataset.rets[:limit]) / limit
        checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        env = gym.make(env_name + '-v1')  # FIX: with MuJoCo 1.50, MuJoCo envs are -v2
        # ENHANCEMENT: generate and save videos to checkpoint_dir.
        # Errors with "ERROR: GLEW initalization error: Missing GL version" on
        # MuJoCo 1.50; set LD_PRELOAD, see https://github.com/openai/mujoco-py/issues/44
        env = wrappers.Monitor(env, checkpoint_dir, force=True)
        env.seed(seed)
        print('Trajectory limitation: {}, Load checkpoint: {}, '.format(
            limit, checkpoint_path))
        avg_len, avg_ret = run_mujoco.runner(env,
                                             policy_fn,
                                             checkpoint_path,
                                             timesteps_per_batch=1024,
                                             number_trajs=10,
                                             stochastic_policy=stochastic,
                                             reuse=((i != 0) or reuse))
        normalized_ret = avg_ret / upper_bound
        print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
            upper_bound, avg_ret, normalized_ret))
        log['traj_limitation'].append(limit)
        log['upper_bound'].append(upper_bound)
        log['avg_ret'].append(avg_ret)
        log['avg_len'].append(avg_len)
        log['normalized_ret'].append(normalized_ret)
        env.close()
    return log
def sim_agent(env, policy, task, scaler, num_episodes_sim=1, animate=False,
              save_video=False, out_dir='./video'):
    """ Simulates a trained agent (policy) in a given environment (env).

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        task: int indicating which head (task-specific hidden layer) of the policy to use
        num_episodes_sim (int): number of episodes to simulate
        animate (bool): determines if video should be rendered in a window
        save_video (bool): enables saving video and other stats of simulated episodes

    Returns:
        mean_reward_episodes (double): mean reward obtained across all episodes
        if save_video=True, stores videos and stats in the folder determined by 'out_dir'
    """
    # Monitoring config
    if save_video:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)  # create directory if it doesn't exist
        env = wrappers.Monitor(env, out_dir, force=True)  # used to save log data and video

    # Simulate each episode
    episodes_tot_reward = []
    for episode in range(num_episodes_sim):
        obs = env.reset()
        reward_sum = 0
        done = False
        step = 0.0
        scale, offset = scaler.get()  # standardize observations
        scale[-1] = 1.0   # don't scale the "step" additional feature
        offset[-1] = 0.0  # don't offset the "step" additional feature

        # Start env simulation
        while not done:
            if animate:
                env.render()
            # Modify observation: add the time feature and standardize using the running mean/std
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)  # add time step as additional feature
            obs = (obs - offset) * scale  # center and scale observations
            # Act based on the policy network
            action = policy.sample(obs, task).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            reward_sum += reward
            step += 1e-3  # increment time step feature

        # Accumulate info for the episode
        episodes_tot_reward.append(reward_sum)

    # Get stats over all episodes
    mean_reward_episodes = np.mean(episodes_tot_reward)
    return mean_reward_episodes
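# (Not part of the original snippet.) Hypothetical usage of sim_agent; env, policy,
# and scaler are assumed to come from the surrounding training code, and the task
# index and output directory are illustrative.
mean_reward = sim_agent(env, policy, task=0, scaler=scaler,
                        num_episodes_sim=5, save_video=True, out_dir='./video')
print('mean reward over simulated episodes:', mean_reward)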
        G = np.dot(multiplier, quess_rewards)
        model.update(states[0], actions[0], G)
        rewards.pop(0)
        actions.pop(0)
        states.pop(0)

    return totalreward


if __name__ == "__main__":
    env = gym.make("MountainCar-v0")
    ft = FeatureTransformer(env)
    model = Model(env, ft)

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = 'Videos/' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    gamma = 0.99
    N = 300
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0 / (0.1 * n + 1)
        totalreward = play_one_episode(model, eps, gamma)
        totalrewards[n] = totalreward
        print("episode:", n, "total reward:", totalreward, "eps: ", eps)

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", -totalrewards.sum())

    plt.plot(totalrewards)
    plt.show()
                  .format(episode, n_train_episodes, episode_rewards, epsilon))
            break
    rewards.append(episode_rewards)
    epsilon = update_epsilon(epsilon)

# PLOT RESULTS
x = range(n_train_episodes)
plt.plot(x, rewards)
plt.xlabel('Episode number')
plt.ylabel('Training cumulative reward')
plt.savefig('DQN_CartPole.png', dpi=300)
plt.show()

# TEST PHASE
env = wrappers.Monitor(env, './videos/' + str(time()) + '/')
for episode in range(n_test_episodes):
    current_state = env.reset()
    current_state = preprocess_state(current_state)
    episode_rewards = 0
    for t in range(n_steps):
        env.render()
        action = greedy_policy(current_state)
        next_state, reward, done, _ = env.step(action)
        next_state = preprocess_state(next_state)
        memory.append((current_state, action, reward, next_state, done))
        current_state = next_state
        episode_rewards += reward
        if done:
action="store", default=500, help="Nombre d\'épisodes d\'apprentissage.") parser.add_argument("-t", "--test", type=int, action="store", default=0, help="Nombre d\'épisodes de test") args = parser.parse_args() logger.set_level(logger.INFO) env = gym.make("BreakoutNoFrameskip-v4") # Monitor gym pour la vidéo outdir = '/tmp/random-agent-results' env = wrappers.Monitor(env, directory=outdir, force=True) env.seed(0) # hyperparamètres EXPLO = ["greedy", "boltzmann"] TARGET_UPDATE = ["freq", "polyak"] PARAMS = { "gamma": 0.8, "max_tau": 1, "min_tau": 0.1, "tau_decay": 0.99, "exploration": EXPLO[0], "sigma": 1e-3, "alpha": 0.005, "m": 4, "frame_skip": 4,
print(env.action_space)     # What actions are possible
print(env.step(1))          # Take action 1; returns the observation, the reward, and done, a boolean (game over or not)
env.render()                # Visualize the game grid
env.render(mode="human")    # Visualization in the console

# statedic, mdp = env.getMDP()  # Retrieve the MDP: statedic
# print("Number of states: ", len(statedic))  # number of states; statedic: state -> state index
# state, transitions = list(mdp.items())[0]
# print(state)        # one state of the MDP
# print(transitions)  # dictionary of transitions for the state: {action -> [proba, state, reward, done]}

# Run with an agent
agent = RandomAgent(env.action_space)

# Write a log file over several scenarios
outdir = 'gridworld-v0/random-agent-results'
envm = wrappers.Monitor(env, directory=outdir, force=True, video_callable=False)
env.seed()  # Initialize the pseudo-random generator

episode_count = 2000
reward = 0
done = False
rsum = 0
FPS = 0.0001
all_rsum = []
for i in range(episode_count):
    obs = envm.reset()
    env.verbose = (i % 100 == 0 and i > 0)  # render 1 episode out of 100
    if env.verbose:
        env.render(FPS)
    j = 0
    # rsum = 0
    done = False
    reward = 0.0
    while not done:
        action = scale_action(env, agent.step(obs, reward))
        obs, reward, done, _ = env.step(action)
        reward_tot += reward
        env.render()
    agent.step(obs, reward)
    agent.reset()
    return reward_tot


if __name__ == '__main__':
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    env_to_wrap = gym.make("Pendulum-v0")
    env = wrappers.Monitor(env_to_wrap, 'logging/', force=True,
                           video_callable=lambda episode_id: True)
    agent = ModelAgent(env.observation_space.shape[0],
                       env.action_space.shape[0])

    for i in range(10):
        reward_tot = run_episode(env, agent)
        print("Episode: ", i + 1, "---", "Total Reward: ", reward_tot)

    env.close()
def eva(self):
    agent = DDQN_Agent(n_states=self.n_states,
                       n_actions=self.n_actions,
                       batch_size=self.config.batch_size,
                       hidden_size=self.config.hidden_size,
                       memory_size=self.config.memory_size,
                       update_step=self.config.update_step,
                       learning_rate=self.config.learning_rate,
                       gamma=self.config.gamma,
                       tau=self.config.tau)
    test_reward_array = np.zeros(100)

    # Load checkpoint to restore the model
    agent.policy_model.load_state_dict(
        torch.load(self.config.DDQN_CHECKPOINT_PATH,
                   map_location=agent.device))

    t = trange(self.config.test_episodes)
    for episode in t:
        state = self.env.reset()
        done = False
        rewards = 0
        while not done:
            # Disable epsilon-greedy search
            action = agent.act(state, epsilon=0)
            state, reward, done, _ = self.env.step(action)
            rewards += reward
        t.set_description('Episode {:.2f} Reward {:.2f}'.format(episode + 1, rewards))
        t.refresh()
        test_reward_array[episode] = rewards
    self.env.close()

    # Show the evaluation results
    avg_test_reward = round(np.mean(test_reward_array), 2)
    plt.subplots(figsize=(5, 5), dpi=100)
    plt.plot(test_reward_array)
    plt.ylabel('Total Reward', fontsize=12)
    plt.xlabel('Trial', fontsize=12)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title('Total Rewards Per Trial for 100 Trials - Average: {:.2f}'.format(
        avg_test_reward), fontsize=12)
    plt.savefig(self.config.DDQN_RESULT_IMG_PATH.format(1), dpi=100,
                bbox_inches='tight')
    print('\nSave evaluation rewards plot as {}.'.format(
        self.config.DDQN_RESULT_IMG_PATH.format(1)))

    # Play a round
    env = wrappers.Monitor(self.env, self.config.DDQN_AGENT_PATH, force=True)
    state = env.reset()
    done = False
    rewards = 0.
    while not done:
        # Disable epsilon-greedy search
        action = agent.act(state, epsilon=0)
        state, reward, done, _ = env.step(action)
        rewards += reward
    env.close()
    print('Total rewards in a game: {:.2f}'.format(rewards))
    print('Save video record to {}.'.format(self.config.DDQN_AGENT_PATH))