def main():
    """Record one evaluation episode of a saved Q-network.

    Command line: ``<env> <model_json> <weights> <directory>``.

    The model architecture is read from ``<model_json>`` and its weights from
    ``<weights>``. The environment is wrapped in ``gym.wrappers.Monitor`` so
    that the episode is written to ``<directory>`` (``force=True`` is passed
    to the monitor).

    Exits with status 1 after printing the usage line when the number of
    arguments is wrong.
    """
    if len(sys.argv) != 5:
        print("usage:{} <env> <model_json> <weights> <directory>".format(sys.argv[0]))
        # A bare sys.exit() reports success (status 0); a usage error must
        # be visible to shells and calling scripts as a failure.
        sys.exit(1)

    env = gym.make(sys.argv[1])
    env.frameskip = 1
    with open(sys.argv[2]) as json_file:
        # "Eq9" is a custom layer the serialized model refers to.
        model = model_from_json(json.load(json_file), {"Eq9": Eq9})
    model.load_weights(sys.argv[3])

    epsilon = 0.01
    input_shape = (84, 84)
    history_size = 4
    eval_size = 1
    directory = sys.argv[4]

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    # Preprocessors are applied from left to right.
    preprocessors = PreprocessorSequence([atari_prep, history_prep, numpy_prep])

    policy = GreedyEpsilonPolicy(epsilon)
    agent = DQNAgent(model, preprocessors, None, policy, 0.99, None, None, None, None)

    env = gym.wrappers.Monitor(env, directory, force=True)
    # The per-episode rewards/lengths returned here are not used by this script.
    agent.evaluate_detailed(env, eval_size, render=False, verbose=True)
def setUpClass(cls):
    """Build the Breakout environment and preprocessing chain shared by the tests.

    Stores on the class: ``env``, ``preprocessors`` (Atari -> history ->
    numpy, applied left to right) and ``atari_prep``.
    """
    cls.env = gym.make("Breakout-v0")

    frame_history = HistoryPreprocessor(4)
    frame_scaler = AtariPreprocessor((84, 84), 0, 999)
    to_numpy = NumpyPreprocessor()

    pipeline = [frame_scaler, frame_history, to_numpy]  # from left to right
    cls.preprocessors = PreprocessorSequence(pipeline)
    cls.atari_prep = frame_scaler
def __init__(self, q_network, target_netwrok, policy, gamma, num_burn_in,
             train_freq, batch_size, config):
    """Store the networks and hyperparameters and build the config-driven helpers.

    ``config`` is kept on the instance and is also handed to ``ReplayMemory``,
    ``HistoryPreprocessor`` and ``AtariPreprocessor``.
    """
    # Online and target networks.
    self.q = q_network
    self.q_target = target_netwrok

    # Replay memory is built from the shared configuration object.
    self.memory = ReplayMemory(config)

    # Hyperparameters.
    self.policy = policy
    self.gamma = gamma
    self.num_burn_in = num_burn_in
    self.train_freq = train_freq
    self.batch_size = batch_size

    # Progress counters, all starting at zero.
    self.currentIter = 0
    self.currentEps = 0
    self.currentReward = 0

    self.config = config

    # Frame preprocessing helpers.
    self.historyPre = HistoryPreprocessor(config)
    self.AtariPre = AtariPreprocessor(config)
def main():  # noqa: D103
    """Train a double-Q agent with replay memory and a target network."""
    cli = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    cli.add_argument('--env', default='Breakout-v0', help='Atari env name')
    cli.add_argument('-o',
                     '--output',
                     default='atari-v0',
                     help='Directory to save data to')
    cli.add_argument('--seed', default=0, type=int, help='Random seed')
    args = cli.parse_args()

    # Set up the environment.
    env = gym.make(str(args.env))
    num_actions = env.action_space.n

    # Hyperparameters.
    frames_per_state = 4
    input_shape = (84, 84)
    gamma = .99
    num_iterations = 5000000
    target_update_freq = 10000
    num_burn_in = 32
    train_freq = 0
    batch_size = 32
    replay_mem_size = 1000000
    replay_start_size = 50000
    max_episode_len = 10000
    held_out_states_size = 1000
    is_double_q = True

    # Online network (also plotted to model.png) and target network.
    online_net = create_model(frames_per_state,
                              input_shape,
                              num_actions,
                              model_name='linear q_network')
    plot_model(online_net, to_file='model.png')
    target_net = create_model(frames_per_state,
                              input_shape,
                              num_actions,
                              model_name='linear q_network target')

    history = HistoryPreprocessor(frames_per_state - 1)
    replay = ReplayMemory(replay_mem_size, frames_per_state)
    held_out = ReplayMemory(held_out_states_size, frames_per_state)
    exploration = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))

    agent = DQNAgent(online_net, target_net, history, replay, exploration,
                     held_out, held_out_states_size, gamma,
                     target_update_freq, num_burn_in, train_freq, batch_size,
                     replay_start_size, num_actions, is_double_q)

    # Compile and train.
    optimizer = Adam(lr=0.0001)
    agent.compile(optimizer, mean_huber_loss)
    agent.fit(env, num_iterations, max_episode_len)
def test_detail(self):
    """Cross-check sampled states of ActionReplayMemory.

    Fills the new and the old replay memory with the same 1000 random frames,
    then draws 10 batches at identical indexes from both and verifies that no
    sampled channel is all zeros, that both memories return the same
    channels, and (for indexes >= 4) that channels 1-3 match
    ``memory.survey`` of the preceding indexes.
    """
    memory = ActionReplayMemory(250, 4)  # memory under test
    memory_old = ActionReplayMemoryOld(250, 4)
    h_prep = HistoryPreprocessor(4)
    np_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence([h_prep, np_prep])

    for _ in range(1000):
        frame = np.random.randint(0, 100, (84, 84))
        stored = preprocessors.process_state_for_memory(frame)
        memory.append(stored, 4, 5)
        memory_old.append(stored, 4, 5)

    batch_size = 32
    blank = np.zeros((84, 84))
    for _ in range(10):
        indexes = np.random.randint(0, memory._filled_size,
                                    size=batch_size).tolist()
        new_batch = memory.sample(batch_size, indexes)
        old_batch = memory_old.sample(batch_size, indexes)
        curr_arr, terminal_arr = new_batch[0], new_batch[4]
        curr_arr2 = old_batch[0]

        for i in range(len(terminal_arr)):
            new_state = curr_arr[i]
            old_state = curr_arr2[i]
            for d in range(4):
                self.assertTrue(not np.all(new_state[:, :, d] == blank))
                self.assertTrue(
                    np.all(new_state[:, :, d] == old_state[:, :, d]))
            if indexes[i] >= 4:
                for back in (1, 2, 3):
                    self.assertTrue(
                        np.all(new_state[:, :, back] == memory.survey(
                            indexes[i] - back)))
                self.assertTrue(
                    np.all(new_state[:, :, 0] == old_state[:, :, 0]))
def main():
    """Evaluate a saved Q-network for 100 episodes and plot the reward histogram.

    Command line: ``<env> <model_json> <weights> <render> <random>``.

    ``<render>`` is ``"y"`` to render the environment while playing;
    ``<random>`` is ``"y"`` to act with a uniform random policy instead of the
    epsilon-greedy (epsilon = 0.01) policy.

    Prints mean/standard deviation of episode rewards and lengths, the
    max/min reward, then shows a histogram of the rewards.

    Exits with status 1 after printing the usage line when the number of
    arguments is wrong.
    """
    if len(sys.argv) != 6:
        print("usage:{} <env> <model_json> <weights> <render> <random>".format(
            sys.argv[0]))
        # A bare sys.exit() reports success (status 0); signal the misuse.
        sys.exit(1)

    env = gym.make(sys.argv[1])
    env.frameskip = 1
    with open(sys.argv[2]) as json_file:
        # "Eq9" is a custom layer the serialized model refers to.
        model = model_from_json(json.load(json_file), {"Eq9": Eq9})
    model.load_weights(sys.argv[3])

    epsilon = 0.01
    input_shape = (84, 84)
    history_size = 4
    eval_size = 100
    render = (sys.argv[4] == "y")

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    # Preprocessors are applied from left to right.
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])

    if sys.argv[5] == "y":
        print("using random policy")
        policy = UniformRandomPolicy(env.action_space.n)
    else:
        print("using greedy policy")
        policy = GreedyEpsilonPolicy(epsilon)

    agent = DQNAgent(model, preprocessors, None, policy, 0.99, None, None,
                     None, None)
    agent.add_keras_custom_layers({"Eq9": Eq9})
    reward_arr, length_arr = agent.evaluate_detailed(env,
                                                     eval_size,
                                                     render=render,
                                                     verbose=True)

    # The length SD must come from length_arr; the original reported the
    # reward SD a second time here.
    print("\rPlayed {} games, reward:M={}, SD={} length:M={}, SD={}".format(
        eval_size, np.mean(reward_arr), np.std(reward_arr),
        np.mean(length_arr), np.std(length_arr)))
    print("max:{} min:{}".format(np.max(reward_arr), np.min(reward_arr)))
    plt.hist(reward_arr)
    plt.show()
def main():  # noqa: D103
    """Train a Q-network online: no replay memory, MSE loss."""
    cli = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    cli.add_argument('--env', default='Breakout-v0', help='Atari env name')
    cli.add_argument('-o',
                     '--output',
                     default='atari-v0',
                     help='Directory to save data to')
    cli.add_argument('--seed', default=0, type=int, help='Random seed')
    args = cli.parse_args()

    # Set up the environment.
    env = gym.make(str(args.env))
    num_actions = env.action_space.n

    # Hyperparameters; the zero-valued ones are passed to the agent as-is.
    frames_per_state = 4
    input_shape = (84, 84)
    gamma = .99
    num_iterations = 500000
    target_update_freq = 0
    num_burn_in = 0
    train_freq = 0
    batch_size = 0

    q_network = create_model(frames_per_state,
                             input_shape,
                             num_actions,
                             model_name='linear q_network')
    history = HistoryPreprocessor(frames_per_state - 1)
    replay = None  # this variant runs without a replay memory
    exploration = LinearDecayGreedyEpsilonPolicy(1, .05, 10e6)

    agent = DQNAgent(q_network, history, replay, exploration, gamma,
                     target_update_freq, num_burn_in, train_freq, batch_size)

    # Compile and train.
    optimizer = Adam(lr=0.0001)
    objective = losses.mean_squared_error
    agent.compile(optimizer, objective)
    agent.fit(env, num_iterations)
def testHistoryPreprocessor():
    """Check that HistoryPreprocessor keeps the last three 2x2 frames.

    The history starts as all zeros; each processed frame is expected to
    appear in the last channel while older frames shift towards channel 0.
    """

    def stacked(channels):
        # Every pixel of the 2x2 frame carries the same per-channel values.
        return np.array([[channels, channels], [channels, channels]])

    # Constant-valued frames 1..5.
    frames = [np.array([[k, k], [k, k]]) for k in range(1, 6)]

    hp = HistoryPreprocessor(frames[0].shape, 3)
    assert np.array_equal(hp.history, stacked([0., 0., 0.]))

    expected_channels = [
        [0., 0., 1.],
        [0., 1., 2.],
        [1., 2., 3.],
        [2., 3., 4.],
        [3., 4., 5.],
    ]
    for frame, channels in zip(frames, expected_channels):
        assert np.array_equal(hp.process_state_for_network(frame),
                              stacked(channels))
def main():
    """Train an agent of the requested network type and save the online model.

    Command-line options: ``--env``, ``-o/--output``, ``--seed`` and
    ``--type`` (one of the entries of ``network_types`` below).

    After training, the online model's architecture is written to
    ``<type>model.json`` and its weights to ``<type>model.h5`` in the current
    directory.

    Raises
    ------
    ValueError
        If ``--type`` is not a known network type. The check runs before the
        environment is created.
    """
    # Valid values for --type; the spellings are part of the CLI and are also
    # handed to create_model, so they are kept verbatim.
    network_types = [
        "Linear", "LinearERTF", "DoubleLinear", "DQN", "DDQN", "Duling"
    ]

    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    # The help text used to end in an empty "()"; list the accepted values.
    parser.add_argument('--type',
                        default="DQN",
                        help='Type of network to train ({}).'.format(
                            ', '.join(network_types)))
    args = parser.parse_args()

    # Check that the network type is valid.
    if args.type not in network_types:
        raise ValueError("Invalid network type.")
    NETWORK_TYPE = args.type

    # Set up the environment.
    env = gym.make(str(args.env))
    NUM_ACTIONS = env.action_space.n

    # Hyperparameters. These are short-run settings; the previously used
    # long-run values were: NUM_ITERATIONS = 1000000,
    # TARGET_UPDATE_FREQ = 100000, REPLAY_START_SIZE = 50000,
    # MAX_EPISODE_LEN = 100 (all others identical).
    FRAMES_PER_STATE = 4
    INPUT_SHAPE = (84, 84)
    GAMMA = .99
    NUM_ITERATIONS = 20000
    TARGET_UPDATE_FREQ = 1000
    BATCH_SIZE = 32
    REPLAY_MEM_SIZE = 1000000
    REPLAY_START_SIZE = 1000
    MAX_EPISODE_LEN = 10
    REWARD_SAMPLE = 1000
    HELD_OUT_STATES_SIZE = 1000

    # Returns a list of models, i.e. [Online, None], [Online, Target] or
    # [OnlineA, OnlineB].
    models = create_model(FRAMES_PER_STATE, INPUT_SHAPE, NUM_ACTIONS,
                          NETWORK_TYPE)
    history = HistoryPreprocessor(FRAMES_PER_STATE - 1)
    preprocessor = Preprocessor()

    # The plain "Linear" variant trains without a replay memory.
    if NETWORK_TYPE != "Linear":
        memory = ReplayMemory(REPLAY_MEM_SIZE, FRAMES_PER_STATE)
    else:
        memory = None
    held_out_states = ReplayMemory(HELD_OUT_STATES_SIZE, FRAMES_PER_STATE)
    policy = LinearDecayGreedyEpsilonPolicy(1, .05, int(1e6))

    agent = DQNAgent(models[0], models[1], preprocessor, history, memory,
                     policy, GAMMA, TARGET_UPDATE_FREQ, BATCH_SIZE,
                     REPLAY_START_SIZE, NUM_ACTIONS, NETWORK_TYPE,
                     REWARD_SAMPLE, held_out_states, HELD_OUT_STATES_SIZE)

    # Compile and train.
    adam = Adam(lr=0.0001)
    loss = mean_huber_loss
    agent.compile(adam, loss)
    agent.fit(env, NUM_ITERATIONS, MAX_EPISODE_LEN)

    # Serialize the architecture to JSON and the weights to HDF5.
    model_json = models[0].to_json()
    with open(NETWORK_TYPE + "model.json", "w") as json_file:
        json_file.write(model_json)
    models[0].save_weights(NETWORK_TYPE + "model.h5")
    print("Saved model to disk")
def main():
    """Train a Q-network without experience replay or target fixing.

    Optional positional arguments: ``[model_name [env_name]]``; they default
    to ``"q2"`` and ``"SpaceInvaders-v0"``.
    """
    cli_args = sys.argv
    model_name = cli_args[1] if len(cli_args) >= 2 else "q2"
    env_name = cli_args[2] if len(cli_args) >= 3 else "SpaceInvaders-v0"
    env = gym.make(env_name)

    # No frame skipping.
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 1
    num_actions = env.action_space.n
    # 2 because it needs to hold the current state and the future state.
    memory_size = 2
    memory_burn_in_num = 1
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 1  # no targeting
    train_freq = 4  # how often the network is trained
    history_size = 4

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    # Applied from left to right.
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon,
                                            decay_steps)

    linear_model = create_model(history_size, input_shape, num_actions,
                                model_name)
    optimizer = Adam(lr=0.001,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-08,
                     decay=0.0)
    loss_func = huber_loss
    linear_model.summary()

    random_policy = UniformRandomPolicy(num_actions)
    memory = ActionReplayMemory(memory_size, history_size)

    agent = DQNAgent(linear_model, preprocessors, memory, policy, 0.99,
                     target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
class DQNAgent:
    """DQN agent with a replay memory and a periodically synced target network.

    Parameters
    ----------
    q_network:
        Online Q-network. This class calls ``predict``, ``train_on_batch``,
        ``get_weights`` and ``save_weights`` on it (presumably a compiled
        Keras model).
    target_netwrok:
        Target Q-network used for the bootstrap value; it receives the online
        weights via ``set_weights``. The misspelled parameter name is kept
        for backward compatibility with keyword callers.
    policy:
        Initial action-selection policy. Note that ``fit`` replaces it with a
        ``UniformRandomPolicy`` when it starts.
    gamma: float
        Discount factor.
    num_burn_in: int
        Number of environment steps before network updates begin.
    train_freq: int
        The network is updated on every step whose counter is a multiple of
        this value.
    batch_size: int
        Number of samples in each minibatch.
    config:
        Configuration object. It is passed to ``ReplayMemory``,
        ``HistoryPreprocessor`` and ``AtariPreprocessor``, and this class
        reads ``evaluation_interval``, ``eval_batch_num``, ``rewardlog``,
        ``losslog``, ``history_length``, ``screen_height``, ``screen_width``,
        ``target_q_update_step``, ``memory_size``, ``decayNum``,
        ``modelname`` and ``epsilon`` from it.
    """

    def __init__(self, q_network, target_netwrok, policy, gamma, num_burn_in,
                 train_freq, batch_size, config):
        self.q = q_network
        self.q_target = target_netwrok
        self.memory = ReplayMemory(config)
        self.policy = policy
        self.gamma = gamma
        self.num_burn_in = num_burn_in
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.currentIter = 0
        self.currentEps = 0
        self.currentReward = 0
        self.config = config
        self.historyPre = HistoryPreprocessor(config)
        self.AtariPre = AtariPreprocessor(config)

    def compile(self, optimizer, loss_func):
        """Placeholder kept for API compatibility; it does nothing.

        The networks handed to the constructor are used as they are.
        """
        pass

    def calc_q_values(self, state, network):
        """Run ``network`` on a single state and return its Q-values.

        The state is copied into a float32 batch of shape ``(1, 4, 84, 84)``,
        so it must be assignable to a ``(4, 84, 84)`` array.

        Returns
        -------
        The first (only) row of ``network.predict`` for that batch.
        """
        state_pre = np.zeros((1, 4, 84, 84), dtype=np.float32)
        state_pre[0] = state
        return network.predict(state_pre, batch_size=1)[0]

    def select_action(self, state, network, **kwargs):
        """Pick an action for ``state`` with the agent's current policy.

        The Q-values come from ``network``; extra keyword arguments are
        accepted and ignored.

        Returns
        -------
        Whatever ``self.policy.select_action`` returns for the Q-values.
        """
        return self.policy.select_action(self.calc_q_values(state, network))

    def fit(self, env, num_iterations, max_episode_length=None):
        """Train the online network by interacting with ``env``.

        Each iteration takes one environment step, stores the transition in
        the replay memory and, once ``num_burn_in`` steps have passed and the
        step counter is a multiple of ``train_freq``, trains on one sampled
        minibatch. Other periodic work:

        * when an episode ends and at least ``config.evaluation_interval``
          steps have passed since the last evaluation, the agent is evaluated
          and the average reward is appended to ``config.rewardlog``;
        * every ``config.target_q_update_step`` steps the target network
          receives the online weights;
        * when the counter reaches ``config.memory_size`` the random policy
          is replaced by a linearly decaying epsilon-greedy policy;
        * the online weights are saved every ``num_iterations // 3`` steps.

        Parameters
        ----------
        env: gym.Env
            The environment to train on.
        num_iterations: int
            Number of environment steps to perform.
        max_episode_length: int
            Accepted for API compatibility; not used by this implementation.

        Returns
        -------
        tuple
            ``(mse_loss, mae_metric, q_network, target_network)`` where the
            first two are the values from the last training batch (0 if no
            batch was trained).
        """
        # np.long was removed in NumPy 1.24; a Python int is unbounded anyway.
        cnt = 0
        episode_rwd = 0
        _screen_raw = self.process_env_reset(env)  # Save to history
        mse_loss, mae_metric = 0, 0
        # Act uniformly at random until the policy switch further below.
        self.policy = UniformRandomPolicy(env.action_space.n)
        evaluation_interval_cnt = 0
        # Integer interval: with true division the modulo test below only
        # fired when num_iterations happened to be divisible by 3.
        save_interval = max(1, num_iterations // 3)

        while cnt < num_iterations:
            cnt += 1
            evaluation_interval_cnt += 1

            current_state = self.historyPre.get_current_state()
            action = self.select_action(current_state, self.q)
            _screen_next_raw, reward, isterminal, _ = env.step(action)
            episode_rwd += reward
            # Save to history and replay memory.
            _screen_raw = self.process_one_screen(_screen_raw, action, reward,
                                                  _screen_next_raw, isterminal,
                                                  True)

            if isterminal:
                if evaluation_interval_cnt >= self.config.evaluation_interval:
                    Aver_reward = self.evaluate(env,
                                                self.config.eval_batch_num)
                    evaluation_interval_cnt = 0
                    with open(self.config.rewardlog, "a") as log:
                        log.write(",".join([
                            str(int(cnt / self.config.evaluation_interval)),
                            str(Aver_reward)
                        ]) + "\n")
                _screen_raw = self.process_env_reset(env)
                episode_rwd = 0

            if cnt >= self.num_burn_in and cnt % self.train_freq == 0:
                samples = self.AtariPre.process_batch(
                    self.memory.sample(self.batch_size))
                x = np.zeros(
                    (self.batch_size, self.config.history_length,
                     self.config.screen_height, self.config.screen_width),
                    dtype=np.float32)
                y = np.zeros((self.batch_size, int(action_size(env))),
                             dtype=np.float32)
                for _index, sample in enumerate(samples):
                    x[_index] = np.copy(sample.state)
                    # Start from the current predictions so that only the
                    # taken action contributes to the loss.
                    y[_index] = self.calc_q_values(sample.state, self.q)
                    if sample.is_terminal:
                        y[_index][sample.action] = sample.reward
                    else:
                        # Bootstrap from the target network's best action.
                        q_next = max(
                            self.calc_q_values(sample.next_state,
                                               self.q_target))
                        y[_index][sample.action] = (sample.reward +
                                                    self.gamma * q_next)

                mse_loss, mae_metric = self.q.train_on_batch(x, y)
                with open(self.config.losslog, "a") as log:
                    log.write(",".join(
                        [str(cnt / 4),
                         str(mse_loss),
                         str(mae_metric)]) + "\n")

            if cnt % self.config.target_q_update_step == 0:
                # Sync the target network with the online network.
                self.q_target.set_weights(self.q.get_weights())

            if cnt == self.config.memory_size:
                # Switch from random actions to decaying epsilon-greedy.
                self.policy = LinearDecayGreedyEpsilonPolicy(
                    1, 0.05, self.config.decayNum)

            if cnt % save_interval == 0:
                TimeStamp = datetime.datetime.now().strftime("%y-%m-%d_%H-%M")
                self.q.save_weights(
                    str(self.config.modelname) + '_' + TimeStamp +
                    '_weights.h5')

        return mse_loss, mae_metric, self.q, self.q_target

    def process_one_screen(self, screen_raw, action, reward, screen_next_raw,
                           isterminal, Is_train):
        """Record one transition and return the raw next screen.

        The next screen is preprocessed for the network and pushed into the
        frame history. When ``Is_train`` is true the current screen
        (preprocessed for memory), action, reward and terminal flag are also
        appended to the replay memory.
        """
        screen_32_next = self.AtariPre.process_state_for_network(
            screen_next_raw)
        screen_8 = self.AtariPre.process_state_for_memory(screen_raw)
        self.historyPre.insert_screen(screen_32_next)
        if Is_train:
            self.memory.append(screen_8, action, reward, isterminal)
        return screen_next_raw

    def process_env_reset(self, env):
        """Reset the frame history and ``env``; return the raw first screen.

        The first screen is also preprocessed and inserted into the history.
        """
        self.historyPre.reset()
        screen_raw = env.reset()
        screen_32 = self.AtariPre.process_state_for_network(screen_raw)
        self.historyPre.insert_screen(screen_32)
        return screen_raw

    def evaluate(self, env, num_episodes):
        """Play ``num_episodes`` full episodes and return the average reward.

        Actions are chosen by a ``GreedyEpsilonPolicy`` with
        ``config.epsilon`` on the online network. No network update is
        performed here.
        """
        eval_policy = GreedyEpsilonPolicy(self.config.epsilon)
        cumu_reward = 0
        epscnt = 0
        while epscnt < num_episodes:
            isterminal = False
            _screen_raw = self.process_env_reset(env)  # Save to history
            while not isterminal:
                current_state = self.historyPre.get_current_state()
                action = self.select_action_test(current_state, eval_policy)
                _screen_next_raw, reward, isterminal, _ = env.step(action)
                cumu_reward += reward
                # NOTE(review): Is_train=True means evaluation transitions
                # are also appended to the replay memory - confirm that this
                # is intended.
                _screen_raw = self.process_one_screen(_screen_raw, action,
                                                      reward, _screen_next_raw,
                                                      isterminal, True)
            epscnt += 1
        return cumu_reward / num_episodes

    def select_action_test(self, state, policy, **kwargs):
        """Pick an action for ``state`` with an explicitly supplied policy.

        The Q-values always come from the online network ``self.q``; extra
        keyword arguments are accepted and ignored.

        Returns
        -------
        Whatever ``policy.select_action`` returns for the Q-values.
        """
        return policy.select_action(self.calc_q_values(state, self.q))
def main():
    """Evaluate a saved Keras Q-network greedily for 20 episodes.

    The architecture is read from a fixed ``model.json`` path and the weights
    from ``model.h5``. Each episode starts with 30 no-op steps (action 0),
    during which the maximum predicted Q-value per step is accumulated, and
    then acts greedily until the episode ends or 1000 steps (no-ops included)
    have been taken. The first four episodes are recorded through
    ``wrappers.Monitor``.

    Prints the per-action counts, the average episode reward and the average
    of the maximum Q-values over the no-op steps.
    """
    # Load the architecture; the context manager closes the file on errors.
    with open(
            '/home/shivang/Desktop/HW2TomShivang/deeprl_hw2_src_DQNv2/model.json',
            'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)
    # Load weights into the new model.
    model.load_weights("model.h5")
    print("Loaded model from disk")

    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    args = parser.parse_args()

    env1 = gym.make(str(args.env))

    FRAMES_PER_STATE = 4
    MAX_EPISODE_LEN = 1000
    preprocessor = HistoryPreprocessor(FRAMES_PER_STATE - 1)

    # Compile the loaded model.
    adam = Adam(lr=0.0001)
    loss = mean_huber_loss
    model.compile(loss=loss, optimizer=adam)

    max_episode_length = MAX_EPISODE_LEN
    num_episodes = 20
    cumulative_reward = 0
    actions = np.zeros(env1.action_space.n)
    no_op_max = 30
    # Accumulated over ALL episodes: the final average divides by
    # num_episodes, so re-zeroing this per episode (as before) reported only
    # the last episode's values scaled down by num_episodes.
    q_vals_eval = np.zeros(no_op_max)

    for episodes in range(num_episodes):
        if episodes < 4:
            env = wrappers.Monitor(
                env1,
                '/home/shivang/Desktop/HW2TomShivang/Video_evaluation/' +
                str(episodes) + '/',
                force=True)
        else:
            env = env1

        # Get the initial state.
        preprocessor.reset()
        preprocessor.process_state_for_network(env.reset())
        state = preprocessor.frames
        steps = 0

        # No-op phase.
        for i in range(no_op_max):
            q_vals = model.predict(state)
            (next_state, reward, is_terminal, info) = env.step(0)
            actions[0] += 1
            steps = steps + 1
            q_vals_eval[i] = q_vals_eval[i] + max(q_vals[0])
            if is_terminal:
                # Restart through the preprocessor so that `state` stays a
                # frame stack; the raw env.reset() observation was fed to the
                # model here before.
                preprocessor.reset()
                preprocessor.process_state_for_network(env.reset())
            else:
                preprocessor.process_state_for_network(next_state)
            state = preprocessor.frames

        # Greedy phase.
        while steps < max_episode_length:
            q_vals = model.predict(state)
            action = np.argmax(q_vals[0])
            actions[action] += 1
            (next_state, reward, is_terminal, info) = env.step(action)
            cumulative_reward = cumulative_reward + reward
            preprocessor.process_state_for_network(next_state)
            state = preprocessor.frames
            steps = steps + 1
            if is_terminal:
                break

    print(actions)
    avg_reward = cumulative_reward / num_episodes
    avg_qval = np.mean(q_vals_eval) / num_episodes
    print(avg_reward)
    print(avg_qval)
def _str2bool(value):
    """Convert a command-line word such as 'true'/'false' to a bool."""
    if isinstance(value, bool):
        return value
    word = value.strip().lower()
    if word in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with ``type=bool``.
    if word in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


def main():
    """Train or test a DQN / double DQN / dueling DQN agent.

    ``--mode train`` fits the agent, saves the final weights and runs a short
    evaluation. ``--mode test`` (the default) loads the weights file derived
    from ``--env``, ``--method``, ``--network`` and ``--iter`` and evaluates
    for 250 episodes. When monitoring is enabled (test mode only) the
    environment is wrapped in ``wrappers.Monitor`` and the recording is
    uploaded with ``gym.upload`` at the end.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env', default='Breakout-v0', help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--mode', choices=['train', 'test'], default='test')
    parser.add_argument('--network',
                        choices=['deep', 'linear'],
                        default='deep')
    parser.add_argument('--method',
                        choices=['dqn', 'double', 'dueling'],
                        default='dqn')
    # ``type=bool`` turns every non-empty string - including "False" - into
    # True, which made it impossible to switch monitoring off.
    parser.add_argument('--monitor', type=_str2bool, default=True)
    parser.add_argument('--iter', type=int, default=2400000)
    parser.add_argument('--test_policy',
                        choices=['Greedy', 'GreedyEpsilon'],
                        default='GreedyEpsilon')
    args = parser.parse_args()

    # NOTE(review): this overrides whatever was given via --seed - confirm
    # that a fresh random seed on every run is intended.
    args.seed = np.random.randint(0, 1000000, 1)[0]
    args.weights = 'models/dqn_{}_weights_{}_{}_{}.h5f'.format(
        args.env, args.method, args.network, args.iter)
    args.monitor_path = 'tmp/dqn_{}_weights_{}_{}_{}_{}'.format(
        args.env, args.method, args.network, args.iter, args.test_policy)
    if args.mode == 'train':
        # Never record while training.
        args.monitor = False

    env = gym.make(args.env)
    if args.monitor:
        env = wrappers.Monitor(env, args.monitor_path)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Hyperparameters.
    args.gamma = 0.99
    args.learning_rate = 0.0001
    args.epsilon = 0.05
    args.num_iterations = 5000000
    args.batch_size = 32
    args.window_length = 4
    args.num_burn_in = 50000
    args.target_update_freq = 10000
    args.log_interval = 10000
    args.model_checkpoint_interval = 10000
    args.train_freq = 4
    args.num_actions = env.action_space.n
    args.input_shape = (84, 84)
    args.memory_max_size = 1000000
    args.output = get_output_folder(args.output, args.env)
    args.suffix = args.method + '_' + args.network

    # argparse ``choices`` already restricts --method and --test_policy, so
    # no fallback branch is needed for unknown values.
    args.enable_double_dqn = args.method == 'double'
    args.enable_dueling_network = args.method == 'dueling'
    if args.test_policy == 'Greedy':
        test_policy = GreedyPolicy()
    else:
        test_policy = GreedyEpsilonPolicy(args.epsilon)

    print(args)
    K.tensorflow_backend.set_session(get_session())
    model = create_model(args.window_length, args.input_shape,
                         args.num_actions, args.network)

    # The Atari preprocessor handles the frame the agent currently sees; the
    # sequence combines it with the HistoryPreprocessor frames.
    Processor = {}
    Processor['Atari'] = AtariPreprocessor(args.input_shape)
    Processor['History'] = HistoryPreprocessor(args.window_length)
    ProcessorSequence = PreprocessorSequence(Processor)

    # Replay memory for the experience collected during training.
    memory = ReplayMemory(max_size=args.memory_max_size,
                          input_shape=args.input_shape,
                          window_length=args.window_length)

    # Epsilon decays linearly from 1 to 0.1 over the first 1,000,000 steps.
    policy = LinearDecayGreedyEpsilonPolicy(GreedyEpsilonPolicy(args.epsilon),
                                            attr_name='eps',
                                            start_value=1,
                                            end_value=0.1,
                                            num_steps=1000000)

    dqn = DQNAgent(q_network=model,
                   policy=policy,
                   memory=memory,
                   num_actions=args.num_actions,
                   test_policy=test_policy,
                   preprocessor=ProcessorSequence,
                   gamma=args.gamma,
                   target_update_freq=args.target_update_freq,
                   num_burn_in=args.num_burn_in,
                   train_freq=args.train_freq,
                   batch_size=args.batch_size,
                   enable_double_dqn=args.enable_double_dqn,
                   enable_dueling_network=args.enable_dueling_network)
    adam = Adam(lr=args.learning_rate)
    dqn.compile(optimizer=adam)

    if args.mode == 'train':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        # '{step}' is left as a placeholder for the checkpoint callback.
        checkpoint_weights_filename = ('dqn_' + args.env + '_weights_' +
                                       args.suffix + '_{step}.h5f')
        log_filename = 'dqn_{}_log_{}.json'.format(args.env, args.suffix)
        log_dir = '../tensorboard_{}_log_{}'.format(args.env, args.suffix)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=args.model_checkpoint_interval),
            FileLogger(log_filename, interval=100),
            TensorboardStepVisualization(log_dir=log_dir,
                                         histogram_freq=1,
                                         write_graph=True,
                                         write_images=True),
        ]
        # Action repetition is not applied explicitly (action_repetition=1).
        dqn.fit(env,
                callbacks=callbacks,
                verbose=1,
                num_iterations=args.num_iterations,
                action_repetition=1,
                log_interval=args.log_interval,
                visualize=True)
        dqn.save_weights(weights_filename, overwrite=True)
        dqn.evaluate(env,
                     num_episodes=10,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)
    elif args.mode == 'test':
        weights_filename = 'dqn_{}_weights_{}.h5f'.format(
            args.env, args.suffix)
        # args.weights is always set above, so it takes precedence.
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.evaluate(env,
                     num_episodes=250,
                     visualize=True,
                     num_burn_in=5,
                     action_repetition=1)

    # Upload the recording to OpenAI Gym.
    if args.monitor:
        env.close()
        # NOTE(review): SECURITY - an API key is hard-coded in the source;
        # it should be revoked and supplied from outside the repository.
        gym.upload(args.monitor_path, api_key='sk_J62obX9PQg2ExrM6H9rvzQ')
def main():
    """Train a DDQNAgent with a burned-in replay memory.

    Optional positional arguments: ``[model_name [env_name]]``; they default
    to ``"result-q6-qqdn"`` and ``"SpaceInvaders-v0"``.
    """
    cli_args = sys.argv
    model_name = cli_args[1] if len(cli_args) >= 2 else "result-q6-qqdn"
    env_name = cli_args[2] if len(cli_args) >= 3 else "SpaceInvaders-v0"
    env = gym.make(env_name)

    # No frame skipping.
    env.frameskip = 1

    input_shape = (84, 84)
    batch_size = 32
    num_actions = env.action_space.n
    memory_size = 1000000
    memory_burn_in_num = 50000
    start_epsilon = 1
    end_epsilon = 0.01
    decay_steps = 1000000
    target_update_freq = 10000
    train_freq = 4  # how often the network is trained
    history_size = 4

    history_prep = HistoryPreprocessor(history_size)
    atari_prep = AtariPreprocessor(input_shape, 0, 999)
    numpy_prep = NumpyPreprocessor()
    # Applied from left to right.
    preprocessors = PreprocessorSequence(
        [atari_prep, history_prep, numpy_prep])

    policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon,
                                            decay_steps)

    model = create_model(history_size, input_shape, num_actions, model_name)
    model.summary()

    optimizer = Adam(lr=0.001,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-08,
                     decay=0.0)
    loss_func = huber_loss

    # Pre-fill the replay memory using a uniformly random policy.
    random_policy = UniformRandomPolicy(num_actions)
    memory = ActionReplayMemory(memory_size, 4)
    memory_burn_in(env, memory, preprocessors, memory_burn_in_num,
                   random_policy)

    agent = DDQNAgent(model, preprocessors, memory, policy, 0.99,
                      target_update_freq, None, train_freq, batch_size)
    agent.compile(optimizer, loss_func)
    agent.save_models()
    agent.fit(env, 1000000, 100000)
def main():
    """Parse command-line options, build a DQN agent and train it.

    Command-line options:
        --env: gym environment id
            (default ``SpaceInvadersDeterministic-v3``).
        -o / --output: base directory handed to ``get_output_folder``.
        --seed: integer random seed (default 0).
        --model: Q-network type; one of ``linear``, ``mlp``, ``dqn`` or
            ``dueling`` (matched case-insensitively).
        --double: flag forwarded to the agent as ``double_dqn``.

    Exits with status 1 when ``--model`` names an unknown network type.
    """
    parser = argparse.ArgumentParser(description='Run DQN on Atari Breakout')
    parser.add_argument('--env',
                        default='SpaceInvadersDeterministic-v3',
                        help='Atari env name')
    parser.add_argument('-o',
                        '--output',
                        default='atari-v0',
                        help='Directory to save data to')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--model',
                        default='dqn',
                        help='Q Network type to use.')
    parser.add_argument('--double', action='store_true')

    model_map = {
        'linear': LinearQN,
        'mlp': MLP,
        'dqn': DQN,
        'dueling': DuelingDQN
    }

    args = parser.parse_args()
    # NOTE(review): args.seed is parsed but never read in this function.
    args.model = args.model.lower()
    if args.model not in model_map:
        # Print a plain sorted list rather than a dict-keys view.
        print("Invalid model type. Valid types are", sorted(model_map))
        sys.exit(1)

    args.output = get_output_folder(args.output, args.env)

    # Two separate environment instances are created: a plain one and one
    # wrapped in a Monitor that records video every EVAL_NUM_EPISODES-th
    # episode. Both are handed to agent.fit below.
    env = gym.make(args.env)
    monitored_env = gym.wrappers.Monitor(
        gym.make(args.env),
        args.output,
        video_callable=lambda i: i % EVAL_NUM_EPISODES == 0)

    # Anything whose id does not start with "CartPole" is treated as Atari.
    atari = not args.env.startswith("CartPole")
    if atari:
        input_shape = (IMAGE_SIZE, IMAGE_SIZE)

        def preprocessor():
            """Return a fresh Atari + history preprocessing pipeline."""
            return PreprocessorSequence(
                AtariPreprocessor(new_size=input_shape),
                HistoryPreprocessor(history_length=WINDOW_SIZE,
                                    max_over=True))
    else:
        input_shape = (4, )

        def preprocessor():
            """Return a fresh history-only preprocessor."""
            return HistoryPreprocessor(history_length=WINDOW_SIZE)

    memory = ExperienceReplay(max_size=REPLAY_BUFFER_SIZE,
                              window_length=WINDOW_SIZE)

    NUM_ACTIONS = env.action_space.n
    policy = LinearDecayGreedyEpsilonPolicy(NUM_ACTIONS, 1.0, EPSILON,
                                            NUM_ITERATIONS_LINEAR_DECAY)
    model = model_map[args.model](exp_name=args.output)

    # The agent receives the preprocessor *factory*, not an instance.
    agent = DQNAgent(q_network=model,
                     preprocessor=preprocessor,
                     memory=memory,
                     policy=policy,
                     gamma=GAMMA,
                     target_update_freq=TARGET_UPDATE_FREQ,
                     replay_buffer_size=REPLAY_BUFFER_SIZE,
                     train_freq=TRAIN_FREQ,
                     batch_size=BATCH_SIZE,
                     output_dir=args.output,
                     double_dqn=args.double)
    agent.compile(window=WINDOW_SIZE,
                  input_shape=input_shape,
                  num_actions=NUM_ACTIONS,
                  model_name='q_network')

    # Route termination signals to the agent's handler. SIGHUP exists only on
    # Unix; referencing signal.SIGHUP directly raises AttributeError on
    # Windows, so signals missing on this platform are skipped.
    for sig_name in ('SIGINT', 'SIGTERM', 'SIGHUP'):
        sig = getattr(signal, sig_name, None)
        if sig is not None:
            signal.signal(sig, agent.signal_handler)

    agent.fit(env, monitored_env, num_iterations=NUM_ITERATIONS)
def testPerformance(self):
    """Time repeated single-episode evaluations of a DQNAgent.

    Builds a small convolutional Keras network and a DQNAgent for
    ``SpaceInvaders-v0``, calls ``evaluate_detailed(env, 1)`` 50 times and
    prints the average and total wall-clock time. The test makes no
    assertions; it only reports timings.
    """
    # Create a model of the world.
    env = gym.make("SpaceInvaders-v0")
    try:
        env.frameskip = 1

        # Create a throw-away Keras model.
        input_shape = (84, 84)
        window = 4
        num_actions = env.action_space.n
        model = Sequential(name="test_model")
        model.add(
            Convolution2D(filters=16,
                          kernel_size=8,
                          strides=4,
                          activation='relu',
                          input_shape=(input_shape[0], input_shape[1],
                                       window)))
        model.add(
            Convolution2D(filters=32,
                          kernel_size=4,
                          strides=2,
                          activation='relu'))
        model.add(
            Convolution2D(filters=64,
                          kernel_size=3,
                          strides=1,
                          activation='relu'))
        model.add(Flatten())
        model.add(Dense(units=512, activation='relu'))
        model.add(Dense(units=num_actions, activation='linear'))

        # Loss function and optimizer.
        optimizer = Adam(lr=0.001,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         decay=0.0)
        loss_func = huber_loss

        # Preprocessors; the sequence applies them from left to right. The
        # history length reuses `window` so it cannot drift from the
        # network's input depth.
        history_prep = HistoryPreprocessor(window)
        atari_prep = AtariPreprocessor(input_shape, 0, 999)
        numpy_prep = NumpyPreprocessor()
        preprocessors = PreprocessorSequence(
            [atari_prep, history_prep, numpy_prep])

        memory = ActionReplayMemory(100000, window)
        # NOTE(review): presumably a policy that always returns action 1 --
        # confirm against SamePolicy.
        policy = SamePolicy(1)

        # Positional layout, per the original author's note:
        # (model, preprocessors, memory, policy, 0.99, target_update_freq,
        #  None, train_freq, batch_size)
        dqn_agent = DQNAgent(model, preprocessors, memory, policy, 0.99,
                             10000, None, 4, 32)
        dqn_agent.compile(optimizer, loss_func)

        total_time = 0
        times = 50
        for i in range(times):
            start_time = time.time()
            dqn_agent.evaluate_detailed(env, 1)
            total_time += time.time() - start_time
            # In-place progress counter.
            sys.stdout.write('\r{}'.format(i))
            sys.stdout.flush()

        # Finish the progress line so the summary starts on its own line.
        sys.stdout.write('\n')
        print("average evaluation time:{} total time:{}".format(
            total_time / times, total_time))
    finally:
        # Release the environment even if the benchmark raises.
        env.close()