def __init__(self):
     self.save = True
     self.sess = tf.InteractiveSession()      
     self.agent = DeepAgent()
     self.D = deque()
     self.Holdout = deque()
     
     self.s, self.readout, h_fc1 = self.createNet()
     self.s_t = self.a_t = None
     self.epsilon = INITIAL_EPSILON
     self.t = 0
     self.saver = None
     
     self.a = tf.placeholder("float", [None, ACTIONS])
     self.y = tf.placeholder("float", [None])
     readout_action = tf.reduce_sum(tf.multiply(self.readout, self.a), reduction_indices=1)
     cost = tf.reduce_mean(tf.square(self.y - readout_action))
     self.train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
     #self.train_step = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.9, momentum=0.95).minimize(cost)
     self.max = 100
     self.min  = 100
     self.saver = tf.train.Saver()
     self.sess.run(tf.initialize_all_variables())
     checkpoint = tf.train.get_checkpoint_state("networks")
     if checkpoint and checkpoint.model_checkpoint_path:
         self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
         print("Successfully loaded:", checkpoint.model_checkpoint_path)
     else:
         print("Could not find old network weights")
Example #2
 def build_agent(self, train=True):
     env_params = EnvSetup()
     env_params = env_params.get_params(self.env_name)
     self.agent = DeepAgent(self.env,
                            epsilon=env_params["epsilon"],
                            epsilon_min=env_params["epsilon_min"],
                            epsilon_decay=env_params["epsilon_decay"],
                            experiences_size=env_params["experiences_size"])
     if train:
         self.agent.build_nn(layer_1=env_params["layer_1"],
                             layer_2=env_params["layer_2"],
                             learning_rate=env_params["learning_rate"])
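EnvSetup itself is not shown here; judging from the keys this method reads, get_params presumably returns a plain per-environment dict. A hypothetical illustration with placeholder values (not the project's actual settings):

params = {
    "epsilon": 1.0,             # initial exploration rate
    "epsilon_min": 0.01,
    "epsilon_decay": 0.995,
    "experiences_size": 100000,
    "layer_1": 64,              # hidden layer sizes for build_nn
    "layer_2": 64,
    "learning_rate": 0.001,
    "gamma": 0.99,              # read later by RunEnv.train
    "epochs": 1,
    "min_experience": 1000,
}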
Example #3
 def add_character(self):
     self.agent = DeepAgent(
         "me",  # namFe
         "C",  # avatar
         0,
         0,  # position
     )
     super(DQNGame, self).add_character(self.agent)
Example #4
# ------------------------------------------------------------------------------------------------

# Tutorial sample #2: Run simple mission using raw XML

import MalmoPython
import os
import sys
import time

sys.path.append("functions/.")
from DeepAgent import DeepAgent

sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # flush print output immediately

# Create default Malmo objects:
agent = DeepAgent()
agent_host = MalmoPython.AgentHost()
print agent.actions
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print 'ERROR:',e
    print agent_host.getUsage()
    exit(1)
if agent_host.receivedArgument("help"):
    print agent_host.getUsage()
    exit(0)

# -- set up the mission -- #
mission_file = './mission_setup.xml'
with open(mission_file, 'r') as f:
    print "Loading mission from %s" % mission_file
    mission_xml = f.read()
    my_mission = MalmoPython.MissionSpec(mission_xml, True)
my_mission_record = MalmoPython.MissionRecordSpec()

deep_learner = DeepLearner()
num_repeats = 5000
kills = 0
t = 0

for i in xrange(num_repeats):

    prev_kills = kills
    t = deep_learner.t
    first = True
    deep_learner.agent = DeepAgent()
    deep_learner.agent.kills = kills
    print
    print 'Repeat %d of %d' % (i + 1, num_repeats)
    # Attempt to start a mission:
    max_retries = 3
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print "Error starting mission:", e
                exit(1)
            else:
                time.sleep(2)
Example #5
class RunEnv(object):
    def __init__(self, env_name, verbose=False):
        self.env_name = env_name
        self.verbose = verbose
        self.current_exp = deque(maxlen=1001)

    def train(self, episodes, upload_to_gym=False, show_video=False):
        env_params = EnvSetup()
        env_params = env_params.get_params(self.env_name)
        self.build_env(folder=self.env_name, show_video=show_video)
        self.build_agent()
        if self.env_name == "LunarLander-v2":
            self.run_lunar(episodes=episodes,
                           gamma=env_params["gamma"],
                           epochs=env_params["epochs"],
                           min_experience=env_params["min_experience"],
                           upload_to_gym=upload_to_gym)
        elif self.env_name == "CartPole-v0":
            self.run_cart_pole(episodes=episodes,
                               gamma=env_params["gamma"],
                               epochs=env_params["epochs"],
                               min_experience=env_params["min_experience"],
                               upload_to_gym=upload_to_gym)

    def build_env(self, folder, show_video):
        self.env = gym.make(self.env_name)
        self.env = wrappers.Monitor(
            self.env,
            folder,
            video_callable=lambda count: count % 100 == 0 and show_video,
            force=True)
        self.state_size = self.env.observation_space.shape[0]

    def build_agent(self, train=True):
        env_params = EnvSetup()
        env_params = env_params.get_params(self.env_name)
        self.agent = DeepAgent(self.env,
                               epsilon=env_params["epsilon"],
                               epsilon_min=env_params["epsilon_min"],
                               epsilon_decay=env_params["epsilon_decay"],
                               experiences_size=env_params["experiences_size"])
        if train:
            self.agent.build_nn(layer_1=env_params["layer_1"],
                                layer_2=env_params["layer_2"],
                                learning_rate=env_params["learning_rate"])

    def run_lunar(self,
                  episodes,
                  gamma,
                  epochs,
                  min_experience,
                  upload_to_gym=False):
        total_land = 0
        for i in range(episodes):
            self.current_exp = deque(maxlen=1001)
            # init state
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            # run
            time_t = 0
            exploit = False
            if i > episodes - 200:
                # exploit (no exploration) during the final 200 episodes
                exploit = True
            while True:
                time_t += 1
                action = self.agent.get_action(state, force_exploit=exploit)
                new_state, reward, done, info = self.env.step(action)
                new_state = np.reshape(new_state, [1, self.state_size])
                self.agent.add_experience(state, action, reward, new_state,
                                          done)
                self.current_exp.append(
                    (state, action, reward, new_state, done))
                if reward == 100:
                    total_land += 1
                """
                repeat_good_experience = False
                repeat_good_run = False
                # repeat successful last experience
                if reward == 100 and repeat_good_experience:
                    repeat = 10
                    print("Repeating good experience {} times".format(repeat))
                    for i in range(repeat):
                        self.agent.add_experience(state, action, reward, new_state, done)
                # repeat successful run in memory
                if reward == 100 and repeat_good_run:
                    repeat = 15
                    print("Repeating good full run {} times".format(repeat))
                    for i in range(repeat):
                        for state, action, reward, new_state, done in self.current_exp:
                            self.agent.add_experience(state, action, reward, new_state, done)
                """
                state = new_state
                if done:
                    average_score = np.mean(self.env.get_episode_rewards()[-100:])
                    print("""\n{}/{}. Exploit: {}. Time: {}.
                             Landed: {}.
                             C reward: {}. Last 100: {}.
                             Final reward: {}""".format(
                        i, episodes, exploit, time_t, total_land,
                        round(self.env.get_episode_rewards()[-1], 2),
                        round(average_score, 2), reward))
                    break
            if not exploit and i % 1 == 0:
                #print("Updating model {}".format(i))
                self.agent.learn_vec(batch_size=32,
                                     gamma=gamma,
                                     epochs=epochs,
                                     min_experience=min_experience,
                                     verbose=self.verbose)
            if (i + 1) % 100 == 0:
                print("Saving model")
                self.agent.save_model(filename=self.env_name + "/model")
        # save final model
        self.agent.save_model(filename=self.env_name + "/model")
        self.agent.save_weights(self.env_name + "/model")
        self.env.close()
        if upload_to_gym:
            gym.upload(self.env_name, api_key="sk_9Gt38t5ATla5HL7QI8rFTA")

    def run_cart_pole(self,
                      episodes,
                      gamma,
                      epochs,
                      min_experience,
                      upload_to_gym=False):
        for i in range(episodes):
            # init state
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            # run
            time_t = 0
            exploit = False
            if i > episodes - 200:
                # exploit (no exploration) during the final 200 episodes
                exploit = True
            while True:
                time_t += 1
                action = self.agent.get_action(state, force_exploit=exploit)
                #action = self.env.env.action_space.sample()
                new_state, reward, done, info = self.env.step(action)
                new_state = np.reshape(new_state, [1, self.state_size])
                reward = reward if not done else -10
                self.agent.add_experience(state, action, reward, new_state,
                                          done)
                state = new_state
                if done:
                    print("""\n{}/{}. Exploit: {}. Time: {}.
                             Rewards: {}.
                             Final reward: {}""".format(
                        i, episodes, exploit, time_t,
                        self.env.get_episode_rewards()[-1], reward))
                    break
            if not exploit:
                self.agent.learn_vec(batch_size=32,
                                     gamma=gamma,
                                     epochs=epochs,
                                     min_experience=min_experience,
                                     verbose=self.verbose)
            if (i + 1) % 100 == 0:
                print("Saving model")
                self.agent.save_model(filename=self.env_name + "/model")
        # save final model
        self.agent.save_model(filename=self.env_name + "/model")
        self.agent.save_weights(self.env_name + "/model")
        self.env.close()
        if upload_to_gym:
            gym.upload(self.env_name, api_key="sk_9Gt38t5ATla5HL7QI8rFTA")

    def test(self, episodes, upload_to_gym=False, show_video=True):
        print("Evaluating model")
        self.build_env(folder=self.env_name + "_test", show_video=show_video)
        self.build_agent(train=False)
        self.agent.load_model(self.env_name + "/model")
        self.agent.save_weights(self.env_name + "_test" + "/model")
        # run the test using the loaded recent model
        for i in range(episodes):
            # init state
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            # run
            time_t = 0
            while True:
                time_t += 1
                action = self.agent.get_action(state, force_exploit=True)
                new_state, reward, done, info = self.env.step(action)
                new_state = np.reshape(new_state, [1, self.state_size])
                state = new_state
                if done:
                    print("Test episode: {}/{}. Time {}".format(
                        i, episodes, time_t))
                    break
        self.env.close()
        if upload_to_gym:
            gym.upload(self.env_name + "_test",
                       api_key="sk_9Gt38t5ATla5HL7QI8rFTA")
    print "Loading mission from %s" % mission_file
    mission_xml = f.read()
    my_mission = MalmoPython.MissionSpec(mission_xml, True)
my_mission_record = MalmoPython.MissionRecordSpec()

deep_learner = DeepLearner()
num_repeats = 5000
kills = 0
t = 0

for i in xrange(num_repeats):

    prev_kills = kills
    t = deep_learner.t
    first = True
    deep_learner.agent = DeepAgent()
    deep_learner.agent.kills = kills
    print
    print 'Repeat %d of %d' % (i + 1, num_repeats)
    # Attempt to start a mission:
    max_retries = 3
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print "Error starting mission:", e
                exit(1)
            else:
                time.sleep(2)
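The DeepLearner class used by the mission script in Example #4 (its __init__ is the snippet shown as the first example above):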
class DeepLearner:
    
    def __init__(self):
        self.save = True
        self.sess = tf.InteractiveSession()      
        self.agent = DeepAgent()
        self.D = deque()
        self.Holdout = deque()
        
        self.s, self.readout, h_fc1 = self.createNet()
        self.s_t = self.a_t = None
        self.epsilon = INITIAL_EPSILON
        self.t = 0
        self.saver = None
        
        self.a = tf.placeholder("float", [None, ACTIONS])
        self.y = tf.placeholder("float", [None])
        readout_action = tf.reduce_sum(tf.multiply(self.readout, self.a), reduction_indices=1)
        cost = tf.reduce_mean(tf.square(self.y - readout_action))
        self.train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
        #self.train_step = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.9, momentum=0.95).minimize(cost)
        self.max = 100
        self.min  = 100
        self.saver = tf.train.Saver()
        self.sess.run(tf.initialize_all_variables())
        checkpoint = tf.train.get_checkpoint_state("networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
          
    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape, stddev=0.01)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def conv2d(self, x, W, stride):
        return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID")

    #def max_pool_2x2(self, x):
      #return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')


    def createNet(self):
        W_conv1 = self.weight_variable([filter1_dim, filter1_dim, FRAMES, filter1_depth]) #8,8,4,32
        b_conv1 = self.bias_variable([filter1_depth]) #32

        W_conv2 = self.weight_variable([filter2_dim, filter2_dim, filter1_depth, filter2_depth]) #4,4,32,64
        b_conv2 = self.bias_variable([filter2_depth]) #64

        W_conv3 = self.weight_variable([filter3_dim, filter3_dim, filter2_depth, filter3_depth]) #3,3,64,64
        b_conv3 = self.bias_variable([filter3_depth]) #64

        W_fc1 = self.weight_variable([7*7*filter3_depth, neurons]) #7*7*64, 512
        b_fc1 = self.bias_variable([neurons]) #512

        W_fc2 = self.weight_variable([neurons, ACTIONS]) #512
        b_fc2 = self.bias_variable([ACTIONS])

        # input layer
        s = tf.placeholder(tf.float32, [None, image_dim, image_dim, FRAMES])

        # hidden layers
        h_conv1 = tf.nn.relu(self.conv2d(s, W_conv1, filter1_stride) + b_conv1) #stride 4
        h_conv2 = tf.nn.relu(self.conv2d(h_conv1, W_conv2, filter2_stride) + b_conv2) #stride 2
        h_conv3 = tf.nn.relu(self.conv2d(h_conv2, W_conv3, filter3_stride) + b_conv3) #stride 1
        h_conv3_flat = tf.reshape(h_conv3, [-1, 7*7*filter3_depth])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

        # readout layer
        readout = tf.matmul(h_fc1, W_fc2) + b_fc2
        return s, readout, h_fc1
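    # Shape check for the flattened size above, assuming the values indicated by the
    # inline comments (image_dim = 84; 8x8 stride-4, 4x4 stride-2 and 3x3 stride-1
    # convolutions; all with VALID padding):
    #   conv1: (84 - 8) // 4 + 1 = 20
    #   conv2: (20 - 4) // 2 + 1 = 9
    #   conv3: (9 - 3)  // 1 + 1 = 7
    # which gives the 7 * 7 * filter3_depth input size used for W_fc1.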

    def initNetwork(self, frame, ob, eval):
        # printing
        #a_file = open("logs/readout.txt", 'w')
        #h_file = open("logs/hidden.txt", 'w')

        x_t = self.agent.resize(self.agent.getPixels(frame))
        #x_t = self.agent.threshold(x_t)
        x_t = x_t.reshape(image_dim, image_dim)
        
        r_0 = self.agent.getReward(ob)
        #terminal = ob[u'IsAlive']    
        terminal = False 
        
        self.s_t = np.stack((x_t, x_t, x_t), axis=2)

        # saving and loading networks

        readout_t = self.readout.eval(feed_dict={self.s : [self.s_t]})[0]
        self.a_t = np.zeros([ACTIONS])
        if not eval and random.random() <= self.epsilon:
        #if True:
            print("----------Random Action----------")
            action_index = random.randrange(ACTIONS)
            self.a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            self.a_t[action_index] = 1
        return action_index

    
    def trainNetwork(self, frame, ob, terminal):
        # scale down epsilon
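        # (linear annealing: epsilon decreases from INITIAL_EPSILON to FINAL_EPSILON over EXPLORE steps once t exceeds OBSERVE)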
        if self.epsilon > FINAL_EPSILON and self.t > OBSERVE:
            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1 = self.agent.resize( self.agent.getPixels(frame))
        #cv2.imwrite('messigray.png',x_t1)
        x_t1 = x_t1.reshape(image_dim, image_dim ,1)
        
        r_t = self.agent.getReward(ob)        
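        # stack the newest preprocessed frame in front of the previous FRAMES-1 frames along the channel axis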
        s_t1 = np.append(x_t1, self.s_t[:, :, :FRAMES-1], axis=2)
        #cv2.imwrite('messigray.png',x_t1)
        
        #cv2.imwrite('messigray1.png', np.reshape(s_t1[:,:,0], (84,84)))
        #cv2.imwrite('messigray2.png',np.reshape(s_t1[:,:,1], (84,84)))
        #cv2.imwrite('messigray3.png',np.reshape(s_t1[:,:,2], (84,84)))
        # track learning progress: average max-Q over a fixed holdout set of states
        if self.t < 2000:
            self.Holdout.append(s_t1)
        if self.t % 1000 == 0 and self.t >= 2000:
            readout_batch = np.array(self.readout.eval(feed_dict={self.s: list(self.Holdout)}))
            mean_max_q = np.mean(np.amax(readout_batch, axis=1))
            print(mean_max_q)
            with open("qvalue.txt", "a") as qvalue_file:
                qvalue_file.write(str(mean_max_q) + "\n")

        # store the transition in the replay memory D
        self.D.append((self.s_t, self.a_t, r_t, s_t1, terminal))

        
        if len(self.D) > REPLAY_MEMORY:
            self.D.popleft()

        # only train if done observing
        if self.t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(self.D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]
            
            y_batch = []
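            # Bellman target: y = r for terminal transitions, otherwise r + GAMMA * max_a' Q(s', a')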
            readout_j1_batch = self.readout.eval(feed_dict = {self.s : s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal, only equals reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
                               
            # perform gradient step
            self.train_step.run(feed_dict = {
                self.y : y_batch,
                self.a : a_batch,
                self.s : s_j_batch}
            )

        # update the old values
        self.s_t = s_t1
        self.t += 1

        # save progress every 10000 iterations
        if self.t % 10000 == 0:
            self.saver.save(self.sess, 'networks/zombie-dqn', global_step = self.t)

        readout_t = self.readout.eval(feed_dict={self.s : [self.s_t]})[0]

        self.a_t = np.zeros([ACTIONS])
        if random.random() <= self.epsilon:
            #print("----------Random Action----------")
            action_index = random.randrange(ACTIONS)
            self.a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            self.a_t[action_index] = 1
            
        # print info
        state = ""
        if self.t <= OBSERVE:
            state = "observe"
        elif self.t > OBSERVE and self.t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if self.t % 100 == 0:
            print("TIMESTEP", self.t, "/ STATE", state, \
                "/ EPSILON", self.epsilon, "/ ACTION", self.a_t, "/ REWARD", r_t, \
                "/ Q_MAX %e" % np.max(readout_t))
        
        return action_index
        
    def evalNetwork(self, frame, ob):
        
        x_t1 = self.agent.resize( self.agent.getPixels(frame))
        x_t1 = x_t1.reshape(image_dim, image_dim, 1)
        r_t = self.agent.getReward(ob)

        terminal = False 
        s_t1 = np.append(x_t1, self.s_t[:, :, :FRAMES-1], axis=2)
        
        self.s_t = s_t1
        self.t += 1

        # save progress every 10000 iterations
        if self.t % 10000 == 0:
            self.saver.save(self.sess, 'networks/zombie-dqn', global_step = self.t)

        readout_t = self.readout.eval(feed_dict={self.s : [self.s_t]})[0]
        self.a_t = np.zeros([ACTIONS])
        action_index = np.argmax(readout_t)
        self.a_t[action_index] = 1
        print(max(readout_t))
        if max(readout_t) > self.max:
            cv2.imwrite('hiq.png', self.agent.getPixels(frame))
            self.max = max(readout_t)
        elif max(readout_t) < self.min:
            cv2.imwrite('lowq.png', self.agent.getPixels(frame))
            self.min = max(readout_t)
            print("DONE WITH LOW")
        
        
        # print info
        state = ""
        if self.t <= OBSERVE:
            state = "observe"
        elif self.t > OBSERVE and self.t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if self.t % 100 == 0:
            print("TIMESTEP", self.t, "/ STATE", state, \
                "/ EPSILON", self.epsilon, "/ ACTION", self.a_t, "/ REWARD", r_t, \
                "/ Q_MAX %e" % np.max(readout_t))
                
        return action_index