def add_character(self):
    self.agent = DeepAgent(
        "me",   # name
        "C",    # avatar
        0, 0,   # position
    )
    super(DQNGame, self).add_character(self.agent)
# ------------------------------------------------------------------------------------------------
# Tutorial sample #2: Run simple mission using raw XML

import MalmoPython
import os
import sys
import time

sys.path.append("functions/.")
from DeepAgent import DeepAgent

sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # flush print output immediately

# Create default Malmo objects:
agent = DeepAgent()
agent_host = MalmoPython.AgentHost()
print agent.actions

try:
    agent_host.parse(sys.argv)
except RuntimeError as e:
    print 'ERROR:', e
    print agent_host.getUsage()
    exit(1)
if agent_host.receivedArgument("help"):
    print agent_host.getUsage()
    exit(0)
# ------------------------------------------------------------------------------------------------
# Gym training harness: trains and evaluates a DeepAgent on LunarLander-v2 and CartPole-v0.
from collections import deque

import gym
import numpy as np
from gym import wrappers

from DeepAgent import DeepAgent  # project module
from EnvSetup import EnvSetup    # project module holding per-environment hyperparameters


class RunEnv(object):

    def __init__(self, env_name, verbose=False):
        self.env_name = env_name
        self.verbose = verbose
        self.current_exp = deque(maxlen=1001)

    def train(self, episodes, upload_to_gym=False, show_video=False):
        env_params = EnvSetup()
        env_params = env_params.get_params(self.env_name)
        self.build_env(folder=self.env_name, show_video=show_video)
        self.build_agent()
        if self.env_name == "LunarLander-v2":
            self.run_lunar(episodes=episodes,
                           gamma=env_params["gamma"],
                           epochs=env_params["epochs"],
                           min_experience=env_params["min_experience"],
                           upload_to_gym=upload_to_gym)
        elif self.env_name == "CartPole-v0":
            self.run_cart_pole(episodes=episodes,
                               gamma=env_params["gamma"],
                               epochs=env_params["epochs"],
                               min_experience=env_params["min_experience"],
                               upload_to_gym=upload_to_gym)

    def build_env(self, folder, show_video):
        self.env = gym.make(self.env_name)
        self.env = wrappers.Monitor(
            self.env, folder,
            video_callable=lambda count: count % 100 == 0 and show_video,
            force=True)
        self.state_size = self.env.observation_space.shape[0]

    def build_agent(self, train=True):
        env_params = EnvSetup()
        env_params = env_params.get_params(self.env_name)
        self.agent = DeepAgent(self.env,
                               epsilon=env_params["epsilon"],
                               epsilon_min=env_params["epsilon_min"],
                               epsilon_decay=env_params["epsilon_decay"],
                               experiences_size=env_params["experiences_size"])
        if train:
            self.agent.build_nn(layer_1=env_params["layer_1"],
                                layer_2=env_params["layer_2"],
                                learning_rate=env_params["learning_rate"])

    def run_lunar(self, episodes, gamma, epochs, min_experience, upload_to_gym=False):
        total_land = 0
        for i in range(episodes):
            self.current_exp = deque(maxlen=1001)
            # init state
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            # run
            time_t = 0
            exploit = False
            if i > episodes - 200:
                # exploit only during the final 200 episodes
                exploit = True
            while True:
                time_t += 1
                action = self.agent.get_action(state, force_exploit=exploit)
                new_state, reward, done, info = self.env.step(action)
                new_state = np.reshape(new_state, [1, self.state_size])
                self.agent.add_experience(state, action, reward, new_state, done)
                self.current_exp.append((state, action, reward, new_state, done))
                if reward == 100:
                    total_land += 1
                """
                repeat_good_experience = False
                repeat_good_run = False
                # repeat successful last experience
                if reward == 100 and repeat_good_experience:
                    repeat = 10
                    print("Repeating good experience {} times".format(repeat))
                    for i in range(repeat):
                        self.agent.add_experience(state, action, reward, new_state, done)
                # repeat successful run in memory
                if reward == 100 and repeat_good_run:
                    repeat = 15
                    print("Repeating good full run {} times".format(repeat))
                    for i in range(repeat):
                        for state, action, reward, new_state, done in self.current_exp:
                            self.agent.add_experience(state, action, reward, new_state, done)
                """
                state = new_state
                average_score = np.mean(self.env.get_episode_rewards()[-100:])
                if done:
                    print("""\n{}/{}. Exploit: {}. Time: {}. Landed: {}. C reward: {}. Last 100: {}.
                        Final reward: {}""".format(
                        i, episodes, exploit, time_t, total_land,
                        round(self.env.get_episode_rewards()[-1], 2),
                        round(np.mean(self.env.get_episode_rewards()[-100:]), 2),
                        reward))
                    break
            if not exploit and i % 1 == 0:
                #print("Updating model {}".format(i))
                self.agent.learn_vec(batch_size=32, gamma=gamma, epochs=epochs,
                                     min_experience=min_experience, verbose=self.verbose)
            if (i + 1) % 100 == 0:
                print("Saving model")
                self.agent.save_model(filename=self.env_name + "/model")
        # save final model
        self.agent.save_model(filename=self.env_name + "/model")
        self.agent.save_weights(self.env_name + "/model")
        self.env.close()
        if upload_to_gym:
            gym.upload(self.env_name, api_key="sk_9Gt38t5ATla5HL7QI8rFTA")

    def run_cart_pole(self, episodes, gamma, epochs, min_experience, upload_to_gym=False):
        for i in range(episodes):
            # init state
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            # run
            time_t = 0
            exploit = False
            if i > episodes - 200:
                # exploit only during the final 200 episodes
                exploit = True
            while True:
                time_t += 1
                action = self.agent.get_action(state, force_exploit=exploit)
                #action = self.env.env.action_space.sample()
                new_state, reward, done, info = self.env.step(action)
                new_state = np.reshape(new_state, [1, self.state_size])
                reward = reward if not done else -10
                self.agent.add_experience(state, action, reward, new_state, done)
                state = new_state
                if done:
                    print("""\n{}/{}. Exploit: {}. Time: {}. Rewards: {}. Final reward: {}""".format(
                        i, episodes, exploit, time_t,
                        self.env.get_episode_rewards()[-1], reward))
                    break
            if not exploit:
                self.agent.learn_vec(batch_size=32, gamma=gamma, epochs=epochs,
                                     min_experience=min_experience, verbose=self.verbose)
            if (i + 1) % 100 == 0:
                print("Saving model")
                self.agent.save_model(filename=self.env_name + "/model")
        # save final model
        self.agent.save_model(filename=self.env_name + "/model")
        self.agent.save_weights(self.env_name + "/model")
        self.env.close()
        if upload_to_gym:
            gym.upload(self.env_name, api_key="sk_9Gt38t5ATla5HL7QI8rFTA")

    def test(self, episodes, upload_to_gym=False, show_video=True):
        print("Evaluating model")
        self.build_env(folder=self.env_name + "_test", show_video=show_video)
        self.build_agent(train=False)
        self.agent.load_model(self.env_name + "/model")
        self.agent.save_weights(self.env_name + "_test" + "/model")
        # run test using the loaded recent model
        for i in range(episodes):
            # init state
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            # run
            time_t = 0
            while True:
                time_t += 1
                action = self.agent.get_action(state, force_exploit=True)
                new_state, reward, done, info = self.env.step(action)
                new_state = np.reshape(new_state, [1, self.state_size])
                state = new_state
                if done:
                    print("Test episode: {}/{}. Time {}".format(i, episodes, time_t))
                    break
        self.env.close()
        if upload_to_gym:
            gym.upload(self.env_name + "_test", api_key="sk_9Gt38t5ATla5HL7QI8rFTA")
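# A minimal usage sketch for RunEnv (not part of the original file): the real entry-point script
# may differ, and the episode counts are illustrative. With 2000 training episodes, the final 200
# exploit-only episodes correspond to the last 10% of the run.
if __name__ == "__main__":
    runner = RunEnv("LunarLander-v2", verbose=False)
    runner.train(episodes=2000, upload_to_gym=False, show_video=False)  # train, saving a checkpoint every 100 episodes
    runner.test(episodes=100, show_video=True)                          # replay the saved model without exploration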
print "Loading mission from %s" % mission_file mission_xml = f.read() my_mission = MalmoPython.MissionSpec(mission_xml, True) my_mission_record = MalmoPython.MissionRecordSpec() deep_learner = DeepLearner() num_repeats = 5000 kills = 0 t = 0 for i in xrange(num_repeats): prev_kills = kills t = deep_learner.t first = True deep_learner.agent = DeepAgent() deep_learner.agent.kills = kills print print 'Repeat %d of %d' % (i + 1, num_repeats) # Attempt to start a mission: max_retries = 3 for retry in range(max_retries): try: agent_host.startMission(my_mission, my_mission_record) break except RuntimeError as e: if retry == max_retries - 1: print "Error starting mission:", e exit(1) else: time.sleep(2)
# ------------------------------------------------------------------------------------------------
# DQN learner for the Malmo zombie mission: convolutional Q-network, epsilon-greedy exploration
# and experience replay, in the style of DeepMind's Atari DQN.
import random
from collections import deque

import cv2
import numpy as np
import tensorflow as tf

from DeepAgent import DeepAgent


class DeepLearner:

    def __init__(self):
        self.save = True
        self.sess = tf.InteractiveSession()
        self.agent = DeepAgent()
        self.D = deque()        # replay memory
        self.Holdout = deque()  # fixed set of states used to track average max-Q
        self.s, self.readout, h_fc1 = self.createNet()
        self.s_t = self.a_t = None
        self.epsilon = INITIAL_EPSILON
        self.t = 0
        self.saver = None

        # training op: squared error between the target y and the Q-value of the chosen action
        self.a = tf.placeholder("float", [None, ACTIONS])
        self.y = tf.placeholder("float", [None])
        readout_action = tf.reduce_sum(tf.multiply(self.readout, self.a), reduction_indices=1)
        cost = tf.reduce_mean(tf.square(self.y - readout_action))
        self.train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
        #self.train_step = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.9, momentum=0.95).minimize(cost)

        self.max = 100
        self.min = 100

        # saving and loading networks
        self.saver = tf.train.Saver()
        self.sess.run(tf.initialize_all_variables())
        checkpoint = tf.train.get_checkpoint_state("networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape, stddev=0.01)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def conv2d(self, x, W, stride):
        return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding="VALID")

    #def max_pool_2x2(self, x):
    #    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    def createNet(self):
        W_conv1 = self.weight_variable([filter1_dim, filter1_dim, FRAMES, filter1_depth])        # 8,8,FRAMES,32
        b_conv1 = self.bias_variable([filter1_depth])                                            # 32
        W_conv2 = self.weight_variable([filter2_dim, filter2_dim, filter1_depth, filter2_depth]) # 4,4,32,64
        b_conv2 = self.bias_variable([filter2_depth])                                            # 64
        W_conv3 = self.weight_variable([filter3_dim, filter3_dim, filter2_depth, filter3_depth]) # 3,3,64,64
        b_conv3 = self.bias_variable([filter3_depth])                                            # 64
        W_fc1 = self.weight_variable([7*7*filter3_depth, neurons])                               # 7*7*64, 512
        b_fc1 = self.bias_variable([neurons])                                                    # 512
        W_fc2 = self.weight_variable([neurons, ACTIONS])                                         # 512, ACTIONS
        b_fc2 = self.bias_variable([ACTIONS])

        # input layer
        s = tf.placeholder(tf.float32, [None, image_dim, image_dim, FRAMES])

        # hidden layers
        h_conv1 = tf.nn.relu(self.conv2d(s, W_conv1, filter1_stride) + b_conv1)        # stride 4
        h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, filter2_stride) + b_conv2)  # stride 2
        h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, filter3_stride) + b_conv3)  # stride 1
        h_conv3_flat = tf.reshape(h_conv3, [-1, 7*7*filter3_depth])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

        # readout layer
        readout = tf.matmul(h_fc1, W_fc2) + b_fc2

        return s, readout, h_fc1

    def initNetwork(self, frame, ob, eval):
        # printing
        #a_file = open("logs/readout.txt", 'w')
        #h_file = open("logs/hidden.txt", 'w')
        x_t = self.agent.resize(self.agent.getPixels(frame))
        #x_t = self.agent.threshold(x_t)
        x_t = x_t.reshape(image_dim, image_dim)
        r_0 = self.agent.getReward(ob)
        #terminal = ob[u'IsAlive']
        terminal = False
        # initial state: the first frame repeated FRAMES times
        self.s_t = np.stack((x_t, x_t, x_t), axis=2)

        # choose the first action (epsilon-greedy unless evaluating)
        readout_t = self.readout.eval(feed_dict={self.s: [self.s_t]})[0]
        self.a_t = np.zeros([ACTIONS])
        if not eval and random.random() <= self.epsilon:
            #if True:
            print("----------Random Action----------")
            action_index = random.randrange(ACTIONS)
            self.a_t[action_index] = 1  # one-hot must mark the action actually returned
        else:
            action_index = np.argmax(readout_t)
            self.a_t[action_index] = 1
        return action_index

    def trainNetwork(self, frame, ob, terminal):
        # scale down epsilon
        if self.epsilon > FINAL_EPSILON and self.t > OBSERVE:
            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1 = self.agent.resize(self.agent.getPixels(frame))
        #cv2.imwrite('messigray.png', x_t1)
        x_t1 = x_t1.reshape(image_dim, image_dim, 1)
        r_t = self.agent.getReward(ob)
        s_t1 = np.append(x_t1, self.s_t[:, :, :FRAMES-1], axis=2)
        #cv2.imwrite('messigray.png', x_t1)
        #cv2.imwrite('messigray1.png', np.reshape(s_t1[:,:,0], (84,84)))
        #cv2.imwrite('messigray2.png', np.reshape(s_t1[:,:,1], (84,84)))
        #cv2.imwrite('messigray3.png', np.reshape(s_t1[:,:,2], (84,84)))

        # collect a fixed holdout set of early states, then periodically log their average max-Q
        if self.t < 2000:
            self.Holdout.append(s_t1)
        if self.t % 1000 == 0 and self.t >= 2000:
            readout_batch = self.readout.eval(feed_dict={self.s: list(self.Holdout)})
            readout_batch = np.array(readout_batch)
            print np.mean(np.amax(readout_batch, axis=1))
            file = open("qvalue.txt", "a")
            file.write(str(np.mean(np.amax(readout_batch, axis=1))) + "\n")
            file.close()

        # store the transition in D
        self.D.append((self.s_t, self.a_t, r_t, s_t1, terminal))
        if len(self.D) > REPLAY_MEMORY:
            self.D.popleft()

        # only train if done observing
        if self.t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(self.D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = self.readout.eval(feed_dict={self.s: s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal, the target only equals the reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            self.train_step.run(feed_dict={
                self.y: y_batch,
                self.a: a_batch,
                self.s: s_j_batch})

        # update the old values
        self.s_t = s_t1
        self.t += 1

        # save progress every 10000 iterations
        if self.t % 10000 == 0:
            self.saver.save(self.sess, 'networks/zombie-dqn', global_step=self.t)

        # choose the next action (epsilon-greedy)
        readout_t = self.readout.eval(feed_dict={self.s: [self.s_t]})[0]
        self.a_t = np.zeros([ACTIONS])
        if random.random() <= self.epsilon:
            #print("----------Random Action----------")
            action_index = random.randrange(ACTIONS)
            self.a_t[action_index] = 1  # one-hot must mark the action actually returned
        else:
            action_index = np.argmax(readout_t)
            self.a_t[action_index] = 1

        # print info
        state = ""
        if self.t <= OBSERVE:
            state = "observe"
        elif self.t > OBSERVE and self.t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if self.t % 100 == 0:
            print("TIMESTEP", self.t, "/ STATE", state,
                  "/ EPSILON", self.epsilon, "/ ACTION", self.a_t, "/ REWARD", r_t,
                  "/ Q_MAX %e" % np.max(readout_t))

        return action_index

    def evalNetwork(self, frame, ob):
        x_t1 = self.agent.resize(self.agent.getPixels(frame))
        x_t1 = x_t1.reshape(image_dim, image_dim, 1)
        r_t = self.agent.getReward(ob)
        terminal = False
        s_t1 = np.append(x_t1, self.s_t[:, :, :FRAMES-1], axis=2)

        # update the old values
        self.s_t = s_t1
        self.t += 1

        # save progress every 10000 iterations
        if self.t % 10000 == 0:
            self.saver.save(self.sess, 'networks/zombie-dqn', global_step=self.t)

        # always act greedily during evaluation
        readout_t = self.readout.eval(feed_dict={self.s: [self.s_t]})[0]
        self.a_t = np.zeros([ACTIONS])
        action_index = np.argmax(readout_t)
        self.a_t[action_index] = 1

        # dump the frames with the highest and lowest Q-values seen so far
        print max(readout_t)
        if max(readout_t) > self.max:
            cv2.imwrite('hiq.png', self.agent.getPixels(frame))
            self.max = max(readout_t)
        elif max(readout_t) < self.min:
            cv2.imwrite('lowq.png', self.agent.getPixels(frame))
            self.min = max(readout_t)
            print "DONE WITH LOW"

        # print info
        state = ""
        if self.t <= OBSERVE:
            state = "observe"
        elif self.t > OBSERVE and self.t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if self.t % 100 == 0:
            print("TIMESTEP", self.t, "/ STATE", state,
                  "/ EPSILON", self.epsilon, "/ ACTION", self.a_t, "/ REWARD", r_t,
                  "/ Q_MAX %e" % np.max(readout_t))

        return action_index