def sample_one(self):
        """
        MODIFIED SAMPLING FOR TORCS!
        """
        print()
        print('START PLOTTING MODULE'.center(80, '='))
        roll_distance = []
        print()
        print("TORCS Experiment Start".center(80, '='))
        env = TorcsEnv(vision=self.config.vision,
                       throttle=self.config.throttle)
        try:
            ob = env.reset()
            sonar, grayscale = self.image_to_sonar(ob.img)
            sonar = np.reshape(sonar, [19])
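            # State vector: the 19 sonar readings extracted from the camera image plus
            # the three speed components (speedX, speedY, speedZ), 22 values in total.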
            state = np.concatenate(
                [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
            obs, states, actions, rewards, sonars, grayscales = [], [], [], [], [], []

            done = False  #has the episode ended?
            start_time = time.time()
            while not done and (time.time() - start_time < 300):
                states.append(state)
                obs.append(ob)
                sonars.append(sonar)
                grayscales.append(grayscale)
                state = np.concatenate(
                    [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])],
                    axis=0)

                action = self.sess.run(
                    self.sampled_action,
                    feed_dict={
                        self.observation_placeholder:
                        np.reshape(state, [1, self.observation_dim])
                    })[0]
                ob, reward, done, info = env.step(action)
                sonar, grayscale = self.image_to_sonar(ob.img)
                sonar = np.reshape(sonar, [19])

                #print('Action: ', action)
                actions.append(action)
                rewards.append(reward)
                roll_distance.append(env.distance_travelled)
                #print('Roll distance: ', roll_distance)
        except:
            raise

        finally:
            env.end()  # This is for shutting down TORCS
            print("Finished TORCS session".center(80, '='))
            if roll_distance:
                print('Final distance: ', roll_distance[-1], ' [m]')
            print('END PLOTTING MODULE'.center(80, '='))
            #Plot some of the frames:
            self.grayscales = grayscales
            self.sonars = sonars
            self.obs = obs
            self.actions = actions
            self.roll_distance = roll_distance
Example No. 2
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    generator = Generator(sess, feat_dim, aux_dim, encode_dim, action_dim)
    base_model = ResNet50(weights='imagenet', include_top=False)
    feat_extractor = Model(
        input=base_model.input,
        output=base_model.get_layer('activation_40').output
    )

    try:
        generator.model.load_weights(param_path)
        print("Weight load successfully")
    except:
        print("cannot find weight")

    env = TorcsEnv(throttle=True, gear_change=False)

    print("Start driving ...")
    ob = env.reset(relaunch=True)
    feat, aux = get_state(ob, aux_dim, feat_extractor)

    encode = np.zeros((1, encode_dim), dtype=np.float32)
    encode[0, code] = 1
    print "Encode:", encode[0]

    pre_actions = np.load(pre_actions_path)["actions"]

    for i in range(MAX_STEP_LIMIT):
        if i < MIN_STEP_LIMIT:
            action = np.zeros(3, dtype=np.float32)
        elif i < MIN_STEP_LIMIT + PRE_STEP:
            action = pre_actions[i - MIN_STEP_LIMIT]
        else:
            action = generator.model.predict([feat, aux, encode])[0]

        ob, reward, done, _ = env.step(action)
        feat, aux = get_state(ob, aux_dim, feat_extractor)

        if i == MIN_STEP_LIMIT + PRE_STEP:
            print "Start deciding ..."

        print "Step:", i, "DistFromStart:", ob.distFromStart, \
                "TrackPos:", ob.trackPos, "Damage:", ob.damage.item(), \
                "Action: %.6f %.6f %.6f" % (action[0], action[1], action[2]), \
                "Speed:", ob.speedX * 200

        if done:
            break

    env.end()
    print("Finish.")
Example No. 3
def main():
    # Creating necessary directories
    track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir  = "experiment-%s/" % experiment_name
    datas_dir = experiment_dir + "datas-track-no-%d/" % track_no
    models_dir = datas_dir + "model/"

    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return

    if not os.path.exists(datas_dir):
        print("%s doesn't exist" % datas_dir)
        return

    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    state_dim = 4
    img_dim = [304, 412, 3]
    sess = tf.InteractiveSession()
    agent = Supervise(sess, state_dim, img_dim, models_dir)
    agent.load_network()

    MAX_STEP = 10000
    step = 0
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False, track_no=track_no, random_track=False, track_range=(5, 8))
    for i in range(1):
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
        i_t = ob.img
        # print(i_t)

        while step < MAX_STEP:
            action = agent.action(s_t, i_t)
            ob, reward, done, info = env.step([action, 0.16, 0])
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, action))
            i_t = ob.img

            print("Step", step, "Action", action, "Reward", reward)
            if done:
                break

    env.end()
Example No. 4
def test():
    env = TorcsEnv(vision=True, throttle=False)
    ob = env.reset(relaunch=True)
    reward_sum = 0.0
    done = False

    count = 0
    while not done:
        act = model.predict(img_reshape(ob.img).astype('float32') / 255)
        #print(act)
        count += 1
        ob, reward, done, _ = env.step(act)
        reward_sum += reward
    env.end()
    print("Steps before crash: ", count, reward_sum)
    return count, reward_sum
Example No. 5
def programmatic_game(tree_program, track_name='practgt2.xml'):
    episode_count = 2
    max_steps = 100000
    window = 5

    # Generate a Torcs environment
    env = TorcsEnv(vision=False,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)
    for i_episode in range(episode_count):
        ob = env.reset(
            relaunch=True
        )  # relaunch TORCS every 3 episode because of the memory leak error
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        newobs = [item for sublist in tempObs[:-1] for item in sublist]

        for j in range(max_steps):
            act_tree = tree_program.predict([newobs])
            action_prior = [act_tree[0][0], act_tree[0][1], act_tree[0][2]]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            newobs = [item for sublist in tempObs[:-1] for item in sublist]

            ob, r_t, done, info = env.step(action_prior)
            if np.mod(j, 1000) == 0:
                logging.info("Episode " + str(i_episode) + " Distance " +
                             str(ob.distRaced) + " Lap Times " +
                             str(ob.lastLapTime))

            if done:
                print('Done. Steps: ', j)
                break

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
Example No. 6
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    pre_model = load_model("weights_rescale_all-0000.hdf5")
    # x = np.array([ 4.82767379e-01,  5.92105016e-02,  3.61700505e-01,  2.74807483e-01,
    #     2.31401995e-01,  2.07236990e-01,  1.95800006e-01,  1.89892501e-01,
    #     1.84837490e-01,  1.81293502e-01,  1.77807003e-01,  1.74377009e-01,
    #     1.71005994e-01,  1.66384503e-01,  1.61247000e-01,  1.52030498e-01,
    #     1.35238498e-01,  1.11962005e-01,  8.79574940e-02,  4.76383008e-02,
    #     4.78339800e-01,  6.97819047e-01,  4.60800716e-01,  5.00754069e-01,
    #     -1.00000000e+00,  9.99979496e-01,  8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0])
    # print(pre_y[0])

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        gap = (i / 10) / 100.0
        attack_step = -1
        attack_target = 0
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
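            # Ornstein-Uhlenbeck exploration noise, scaled by train_indicator and the
            # annealed epsilon, so no noise is added when simply running the policy.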
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following (commented-out) code applies a stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # os.system("scrot saved_pic/{}.png".format(j))
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))
            #    if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            # if(step == 60):
            # a_t[0][0] = 1.0
            # s_t_scaled = rescale_state(s_t)
            # # print(s_t[0])
            # s_t_0 = restore_state(s_t_scaled)
            # # print(s_t_0[0])
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val/10.0
            # # print(actions)
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # # print(x_0.shape, s_t_scaled_list.shape, actions.shape)
            # pre_y = pre_model.predict(x_0)
            # # print(x_0[0])
            # # print(pre_y[0])

            # steer_index = int(a_t[0][0]*10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)

            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index/10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1

            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list)*100000))
            # print("{}".format(max(dis_list)*100000))

            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # print(actions[0][0])

            # ob, r_t, done, info = env.step(new_a_t[0])
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])
            # print(a_t[0][0])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))

            # action_states = []
            # for i in range(-5, 6):

            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
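            # TD target computed below: y = r for terminal transitions,
            # otherwise y = r + GAMMA * Q'(s', mu'(s')) from the target networks.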

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv', 'a') as the_file:
            the_file.write(s)
        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name,
                                                   int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        # pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 7
def playGame(train=0):  #1 means Train, 0 means simply Run
    load_from = "."

    save_to = os.path.join("data", "saved")
    save_thresh = 100000  # Save if total reward for the episode is more

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    ou = OU().function  #Ornstein-Uhlenbeck Process
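    # For reference, OU().function in these TORCS DDPG examples is usually the simple
    # mean-reverting update sketched below (an assumption -- check the project's OU class):
    #     def function(self, x, mu, theta, sigma):
    #         return theta * (mu - x) + sigma * np.random.randn(1)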
    buff = ReplayBuffer(BUFFER_SIZE)

    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    def state(ob):
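        # 29-dim state: angle (1) + track (19) + trackPos (1) + speedX/Y/Z (3)
        # + wheelSpinVel (4) + rpm (1)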
        return np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm))

    def load_weights(dir):
        print("Loading weights from ", dir)
        try:
            actor.model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.model.load_weights(os.path.join(dir, "criticmodel.h5"))
            actor.target_model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.target_model.load_weights(
                os.path.join(dir, "criticmodel.h5"))
            print("Weight load successfully")
        except:
            print("Cannot find the weight")

    def save_weights(dir):
        if not os.path.exists(dir):
            os.makedirs(dir)

        print("Saving weights in ", dir)
        actor.model.save_weights(os.path.join(dir, "actormodel.h5"),
                                 overwrite=True)
        critic.model.save_weights(os.path.join(dir, "criticmodel.h5"),
                                  overwrite=True)

        with open(os.path.join(dir, "actormodel.json"), "w") as outfile:
            json.dump(actor.model.to_json(), outfile)

        with open(os.path.join(dir, "criticmodel.json"), "w") as outfile:
            json.dump(critic.model.to_json(), outfile)

    load_weights(load_from)
    # Generate a Torcs environment

    print("TORCS Experiment Start.")
    np.random.seed(1337)

    done = False
    step = 0
    epsilon = 1

    for episode in range(episode_count):

        print("Episode : " + str(episode) + " Replay Buffer " +
              str(buff.count()))

        ob = env.reset()
        s_t = state(ob)

        total_reward = 0.

        progress = tqdm.trange(max_steps, disable=not train)
        for _ in progress:
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train * max(epsilon, 0) * ou(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * ou(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * ou(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following (commented-out) code applies a stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = state(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])

            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)

                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)

                actor.update_target()
                critic.update_target()

            total_reward += r_t
            s_t = s_t1

            progress.set_description("Episode %4i, TR %6.0f, loss %7.0f" %
                                     (episode, total_reward, loss))
            #print("Episode", i, "Step", step, "Action", [ "%.3f" % x for x in a_t[0]], "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        #print("Episode %i, TOTAL REWARD %.0f" % (episode, total_reward))

        if train and total_reward > save_thresh:
            save_weights(save_to + str(episode))
            save_thresh = min(1000000, 2 * save_thresh)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 8
agent = Agent(env)

print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))

    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()

    total_reward = 0.
    for j in range(max_steps):
        action = agent.act(ob)

        ob, reward, done, _ = env.step(action)
        # print(ob)
        total_reward += reward

        step += 1
        if done:
            break

    print("TOTAL REWARD @ " + str(i) + " -th Episode  :  " + str(total_reward))
    print("Total Step: " + str(step))
    print("")

env.end()  # This is for shutting down TORCS
print("Finish.")
Example No. 9

print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))

    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()

    total_reward = 0.
    for j in range(max_steps):
        action = agent.act(ob, reward, done, vision)

        ob, reward, done, _ = env.step(action)
        #print(ob)
        total_reward += reward

        step += 1
        if done:
            break

    print("TOTAL REWARD @ " + str(i) +" -th Episode  :  " + str(total_reward))
    print("Total Step: " + str(step))
    print("")

env.end()  # This is for shutting down TORCS
print("Finish.")
Example No. 10
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")

    attacks = []
    for i in range(-10, 0):
        val = i / 10.0
        attacks.append([77, val])
    # for i in range(45, 55):
    #     attacks.append([i, -1.5])
    #     attacks.append([i, 1.5])
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        # else:
        #     ob = env.reset()
        ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following (commented-out) code applies a stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # if j == 71:
            #     print("cp attack!")
            #     if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            if (j == attacks[i][0]):
                print('cp attack on {} with {}'.format(attacks[i][0],
                                                       attacks[i][1]))
                a_t[0][0] = attacks[i][1]
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            cur_step_sample = [
                s_t.tolist(), a_t[0].tolist(), r_t,
                s_t1.tolist(), done
            ]
            cur_sample.append(cur_step_sample)

            # #Do the batch update
            # batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])

            # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA*target_q_values[k]

            # if (train_indicator):
            #     loss += critic.model.train_on_batch([states,actions], y_t)
            #     a_for_grad = actor.model.predict(states)
            #     grads = critic.gradients(states, a_for_grad)
            #     actor.train(states, grads)
            #     actor.target_train()
            #     critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attacks[i][0],
                                          attacks[i][1])
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        # overall_scores.append(total_reward)
        # plt.clf()
        # plt.plot(overall_scores)
        # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step/10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 11
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    vision = False

    episode_count = 1
    max_steps = 1000  #100000
    done = False
    step = 0
    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, 1, TAU, LRA)
    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load Actor model's weights")
    try:
        actor.model.load_weights("actormodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.

        for j in range(max_steps):
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            ob, r_t, done, info = env.step(a_t_original[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            total_reward += r_t
            s_t = s_t1

            if np.mod(j, 100) == 0:
                print("Episode", i, "Step", step, "Action", a_t_original[0],
                      "Reward", r_t)

            step += 1
            if done:
                break

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 12
def playGame(f_diagnostics,
             train_indicator,
             agent,
             port=3101):  # 1 means Train, 0 means simply Run

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 65  #of sensors input
    env_name = 'Torcs_Env'
    save_location = "./weights/"

    # Generate a Torcs environment
    print("I have been asked to use port: ", port)
    env = TorcsEnv(vision=False, throttle=True, gear_change=False, main=1)
    ob = None
    while ob is None:
        try:
            client = snakeoil3.Client(p=port,
                                      vision=False)  # Open new UDP in vtorcs
            client.MAX_STEPS = np.inf
            client.get_servers_input(0)  # Get the initial input from torcs

            obs = client.S.d  # Get the current full-observation from torcs
            ob = env.make_observation(obs)

            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, \
             ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents))
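            # s_t is 65-dimensional: angle (1) + track (19) + trackPos (1)
            # + speedX/Y/Z (3) + wheelSpinVel (4) + rpm (1) + opponents (36)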
        except:
            pass

    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False
    epsilon_steady_state = 0.01  # This is used for early stopping.

    totalSteps = 0
    best_reward = -100000
    running_avg_reward = 0.

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        save_indicator = 0
        early_stop = 1
        total_reward = 0.
        info = {'termination_cause': 0}
        distance_traversed = 0.
        speed_array = []
        trackPos_array = []

        print('\n\nStarting new episode...\n')
        print("Initial memory consumption: ")

        for step in range(max_steps):

            # Take noisy actions during training
            if (train_indicator == 1):
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, epsilon_steady_state)
                a_t = agent.noise_action(
                    s_t, epsilon)  #Take noisy actions during training

            else:
                a_t = agent.action(s_t)

            try:
                ob, r_t, done, info = env.step(step, client, a_t, early_stop)
                if done:
                    break

                analyse_info(info, printing=False)

                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, \
                 ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents))
                distance_traversed += ob.speedX * np.cos(
                    ob.angle)  #Assuming 1 step = 1 second

                if (math.isnan(r_t)):
                    r_t = 0.0
                    for bad_r in range(50):
                        print('Bad Reward Found')
                    break  #Introduced by Anirban

            # Add to replay buffer only if training
                if (train_indicator):
                    agent.perceive(s_t, a_t, r_t, s_t1,
                                   done)  # Add experience to replay buffer

            except Exception as e:
                print("Exception caught at port " + str(i) + str(e))
                ob = None
                while ob is None:
                    try:
                        client = snakeoil3.Client(
                            p=port, vision=False)  # Open new UDP in vtorcs
                        client.MAX_STEPS = np.inf
                        client.get_servers_input(
                            0)  # Get the initial input from torcs
                        obs = client.S.d  # Get the current full-observation from torcs
                        ob = env.make_observation(obs)
                    except:
                        pass
                    continue
            total_reward += r_t
            s_t = s_t1

            # Displaying progress every 15 steps.
            if ((np.mod(step, 15) == 0)):
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action",
                      a_t, "Reward", r_t)

            totalSteps += 1
            if done:
                break

        # Saving the best model.
        running_avg_reward = running_average(running_avg_reward, i + 1,
                                             total_reward)

        if train_indicator == 1:

            #Save network after every 20 episodes and store the data
            if np.mod(i, 20) == 0:
                agent.saveNetwork(i)

        #Saving training data for client for analysis
        if train_indicator == 1 and np.mod(i, 5) == 0:
            f1 = open(str(port) + ".csv", "a+")
            client.printAnalysis(f1, i)
            f1.close()


        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Num_Steps= " + str(step) + "; Max_steps= " \
         + str(max_steps)  +"; Reward= " + str(total_reward) + \
          "; Running average reward= " + str(running_avg_reward))
        print("Total Step: " + str(totalSteps))
        print("")

        print(info)
        try:
            if 'termination_cause' in info.keys(
            ) and info['termination_cause'] == 'hardReset':
                print('Hard reset by some agent')
                ob, client = env.reset(client=client, relaunch=True)
            else:
                ob, client = env.reset(client=client, relaunch=True)
        except Exception as e:
            print("Exception caught at point B at port " + str(i) + str(e))
            ob = None
            while ob is None:
                try:
                    client = snakeoil3.Client(
                        p=port, vision=False)  # Open new UDP in vtorcs
                    client.MAX_STEPS = np.inf
                    client.get_servers_input(
                        0)  # Get the initial input from torcs
                    obs = client.S.d  # Get the current full-observation from torcs
                    ob = env.make_observation(obs)
                except:
                    print("Exception caught at at point C at port " + str(i) +
                          str(e))


        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, \
         ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents))

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 13
def playGame(trainFlag=0):
    bufferLength = 50000
    #values from Google paper
    #http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html
    gamma = 0.97
    tau = 0.001
    epsilon = 1
    actorLearnRate = 0.0001
    criticLearnRate = 0.001
    episodeMax = 2500
    maxIter = 100000
    epsilonDelta = 1 / (maxIter * 2)
    batchLength = 32
    step = 0
    flag = 0
    avgSpeed = 0
    damage = 0
    totalLoss = 0
    complete = False
    actionDimensions = 3
    stateDimensions = 29

    #set TensorFlow to use GPU, add speedups
    configProto = tensor.ConfigProto()
    configProto.gpu_options.allow_growth = True
    session = tensor.Session(config=configProto)
    K.set_session(session)

    #Initialize actor and critic
    actor = SteveTheActor(actionDimensions, stateDimensions, batchLength,
                          session, actorLearnRate, tau)
    critic = SteveTheCritic(actionDimensions, stateDimensions, batchLength,
                            session, criticLearnRate, tau)
    #initialize framebuffer for replays
    frameBuffer = FrameBuffer(bufferLength)
    #launch game
    torcsEnv = TorcsEnv(vision=False, throttle=True, gear_change=False)

    #load weights from file
    try:
        if os.path.isfile("steveActor.h5"):
            actor.model.load_weights("steveActor.h5")
            actor.targetModel.load_weights("steveActor.h5")
        if os.path.isfile("steveCritic.h5"):
            critic.model.load_weights("steveCritic.h5")
            critic.targetModel.load_weights("steveCritic.h5")
    except:
        print("Error loading weight files")

    for x in range(episodeMax):
        print("Start of Ep:" + str(x) + " Buffer:" +
              str(frameBuffer.getCount()))
        #to get rid of memory leaks every few dozen launches
        if np.mod(x, 25) == 0:
            observ = torcsEnv.reset(relaunch=True)
        else:
            observ = torcsEnv.reset()

        stack = stackSensors(observ)
        avgSpeed = observ.speedX
        rewardSum = 0.0
        for y in range(maxIter):
            epsilon = epsilon - epsilonDelta

            act = np.zeros([1, actionDimensions])
            loss = 0
            actPredict = actor.model.predict(stack.reshape(1, stack.shape[0]))

            #During training, apply Orn-Uhl noise to generate variance
            if trainFlag:
                noise = calcNoise(actionDimensions, actPredict, epsilon)
                act[0][0] = actPredict[0][0] + noise[0][0]
                act[0][1] = actPredict[0][1] + noise[0][1]
                if observ.track[9] < 100 and random.random() <= 0.1:
                    #Add opposite of noise (~0.1 centered rather than 0.1)
                    #To simulate pressing brake slightly
                    #"Feeling the brake"
                    act[0][2] = actPredict[0][2] - noise[0][2]
                else:
                    act[0][2] = actPredict[0][2] + noise[0][2]
            else:
                act[0][0] = actPredict[0][0]
                act[0][1] = actPredict[0][1]
                if observ.track[9] < 100 and random.random() <= 0.1:
                    noise = calcNoise(actionDimensions, actPredict, epsilon)
                    act[0][2] = actPredict[0][2] - noise[0][2]
            #perform action based on predicted input
            #get updated state information
            observ, newReward, complete, info = torcsEnv.step(act[0])
            #stack new sensor information
            newStack = stackSensors(observ)
            #exponentially-weighted running average of speed (smoothing factor 1/batchLength)
            avgSpeed -= avgSpeed / batchLength
            avgSpeed += observ.speedX / batchLength
            damage = observ.damage
            #add new frame to frameBuffer
            frameBuffer.addFrame(stack, act[0], newReward, newStack, complete)
            #if frameBuffer.getSize() > batchLength:
            batch = frameBuffer.getBatch(batchLength)

            state = np.asarray([i[0] for i in batch])
            actions = np.asarray([i[1] for i in batch])
            reward = np.asarray([i[2] for i in batch])
            newState = np.asarray([i[3] for i in batch])
            completeVector = np.asarray([i[4] for i in batch])
            yTrain = np.asarray([i[1] for i in batch])

            targetQVal = critic.getRewards(newState,
                                           actor.targetModel.predict(newState))
            for z in range(len(batch)):
                if not completeVector[z]:
                    yTrain[z] = reward[z] + gamma * targetQVal[z]
                else:
                    yTrain[z] = reward[z]

            if (trainFlag):
                #train the critic on the sampled batch and accumulate its loss
                loss += critic.model.train_on_batch([state, actions], yTrain)
                #actor proposes actions for the sampled states
                actorGradient = actor.model.predict(state)
                #critic supplies dQ/da gradients for those state-action pairs
                gradient = critic.gradients(state, actorGradient)

                #actor trained based on critic gradient
                actor.train(state, gradient)
                actor.trainTarget()
                critic.trainTarget()

            rewardSum = rewardSum + newReward
            stack = newStack
            step += 1
            totalLoss += loss
            if complete:
                break
        if trainFlag and np.mod(x, 5) == 0:
            print("Saving Actor and Critic Models")
            try:
                actor.model.save_weights("steveActor.h5", overwrite=True)
                with open("steveActor.json", "w") as actorFile:
                    dump(actor.model.to_json(), actorFile)

                critic.model.save_weights("steveCritic.h5", overwrite=True)
                with open("steveCritic.json", "w") as criticFile:
                    dump(critic.model.to_json(), criticFile)
            except:
                print("Error saving Actor and Critic Models")
        print("***Episode:" + str(x) + " Reward Sum:" + str(rewardSum) +
              "Loss:" + str(totalLoss) + " avgSpeed:" + str(avgSpeed) +
              " damage:" + str(damage))
        print("***Steps:" + str(step))
        with open('speedwaynewresults.csv', 'a') as csvfile:
            wr = csv.writer(csvfile,
                            delimiter=' ',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
            wr.writerow([x, step, rewardSum, totalLoss, avgSpeed, damage, act])
        totalLoss = 0
        step = 0
    torcsEnv.end()
    print("Race Ended!")
Example No. 14
def run_ddpg(amodel,
             cmodel,
             train_indicator=0,
             seeded=1337,
             track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Now load the weight
        #logging.info("Now we load the weight")
        print("Now we load the weight")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            #logging.info(" Weight load successfully")
            print("Weight load successfully")
        except:
            #ogging.info("Cannot find the weight")
            print("Cannot find the weight")
            exit()

    #logging.info("TORCS Experiment Start.")
    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " +
              str(buff.count()))
        #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        if np.mod(i_episode, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  # relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward",
                  r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if 0 < ob.lastLapTime < best_lap:
                    best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode  : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))
    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
Example No. 15
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 512  #of sensors input

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 600000
    max_steps = 1800
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    #We insert the Deep Q Image Processing Module
    args = {
        'save_model_freq': 10000,
        'target_model_update_freq': 10000,
        'normalize_weights': True,
        'learning_rate': .00025,
        'model': None
    }

    # print(args["save_model_freq"])

    C = DeepQNetwork(512, sess, '/home/lou/DDPG-Keras-Torcs', args=args)
    # print(C)

    x, h_fc1 = C.buildNetwork('test', trainable=True, numActions=1)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodelIMG.h5")
        critic.model.load_weights("criticmodelIMG.h5")
        actor.target_model.load_weights("actormodel2IMG.h5")
        critic.target_model.load_weights("criticmodel2IMG.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 500 episodes because of the memory leak error
        else:
            ob = env.reset()

        imgfinal = np.zeros((1, 128, 128, 4), dtype=np.int32)
        s_t = C.getFC7(imgfinal)
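        # C.getFC7 is assumed to return the 512-d fully-connected feature vector of
        # the stacked image input; those features serve as the DDPG state (state_dim = 512).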

        total_reward = 0.

        imglst = []
        speed = 0
        stepreset = 0

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])

            noise_t = np.zeros([1, action_dim])

            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            a_t_original = actor.model.predict(C.getFC7(imgfinal))
            #print('ATORIGINAL', a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)
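            # OU.function(x, mu, theta, sigma) is assumed to implement Ornstein-Uhlenbeck
            # exploration noise, roughly theta * (mu - x) + sigma * randn(), pulling steering
            # toward 0.0, throttle toward 0.5 and brake toward -0.1; the decaying epsilon
            # scales the noise so exploration fades as training progresses.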

            #The following code does the stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            imglst.append(ob.img)

            if len(imglst) == 4:
                imgcopy = imglst[:]
                imgfinal = np.stack(imgcopy)
                # print("Original stacked matrix", imgfinal)

                imgfinal = np.reshape(imgfinal, (4, 128, 128))
                # print("Reshaped stacked matrix", imgfinal)

                imgfinal = np.transpose(imgfinal, (1, 2, 0))
                # print("Transposed stacked matrix", imgfinal)

                imgfinal = np.reshape(imgfinal, (1, 128, 128, 4))
                # print("Shape of imgfinal", imgfinal.shape)

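            # Once four frames have accumulated, they are stacked channel-last into a
            # (1, 128, 128, 4) batch so the image module can pick up short-term motion.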
            s_t1 = C.getFC7(imgfinal)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            # print('NEW STATES', new_states)

            # target_q_values = critic.target_model.predict([C.getFC7(imgfinal), actor.target_model.predict(C.getFC7(imgfinal))])

            # print('ACTOR TARGET MODEL PREDICT', C.getFC7(imgfinal))
            new_states = np.reshape(new_states, (-1, 512))

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            # print('TARGET Q VALUES', target_q_values)
            # print('NEW STATES', new_states)
            # print('ACTOR MODEL PREDICT NEW STATES', actor.target_model.predict(new_states))
            # print('REWARDS', rewards)

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                states = np.reshape(states, (-1, 512))

                print('STATESSHAPE', np.shape(states))
                print('ACTIONSSHAPE', np.shape(actions))
                print('YT', np.shape(y_t))

                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            speed += ob.speedX * 300
            speedavg = speed / (stepreset + 1)  # +1: stepreset is incremented below; avoids divide-by-zero on the first step
            #print("SPEED X", ob.speedX)

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss, "Average Speed", speedavg)
            esar = (i, step, a_t, r_t, loss, speedavg)
            esar2.append(esar)

            step += 1
            stepreset += 1

            if len(imglst) >= 4:
                del imglst[0]

            # print("Length of imglist", len(imglst))
            # print("List itself", imgfinal)

            if done:
                break

        if np.mod(i, 50) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        esar3 = (i, step, total_reward, speedavg)
        esar4.append(esar3)

        if np.mod(i, 50) == 0:
            save_object(esar2, 'IntraEpisode.pkl')
            save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
Exemplo n.º 16
def programmatic_game(steer, accel, brake, track_name='practice.xml'):
    episode_count = 1
    max_steps = 10000
    window = 5

    # Generate a Torcs environment
    env = TorcsEnv(vision=False,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)

    observation_list = []
    actions_list = []

    for i_episode in range(episode_count):
        ob = env.reset(relaunch=True)
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        total_reward = 0
        sp = []
        lastLapTime = []

        for j in range(max_steps):
            steer_action = clip_to_range(steer.pid_execute(window_list), -1, 1)
            accel_action = clip_to_range(accel.pid_execute(window_list), 0, 1)
            brake_action = clip_to_range(brake.pid_execute(window_list), 0, 1)
            action_prior = [steer_action, accel_action, brake_action]

            observation_list.append(window_list[:])
            actions_list.append(action_prior)  #(mixed_act[:])

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            ob, r_t, done, info = env.step(action_prior)
            #if np.mod(j, 1000) == 0:

            total_reward += r_t
            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                    'lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            if done:
                print('Done. Steps: ', j)
                break

        #logging.info("Episode: " + str(i_episode) + " step: " + str(j+1) + " Distance: " + str(ob.distRaced) + ' ' + str(ob.distFromStart) + " Lap Times: " + str(ob.lastLapTime))
        logging.info(" step: " + str(j + 1) + " " + str(i_episode) +
                     "-th Episode Reward: " + str(total_reward) +
                     " Ave Reward: " + str(total_reward / (j + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))

        env.end()  # This is for shutting down TORCS
        logging.info("Finish.")

    return observation_list, actions_list
Exemplo n.º 17
    def work(self, max_episode_length, gamma, sess, coord, saver):
        """Does the actual work, as the name says ;) Runs the episodes"""

        vision = False

        self.local_AC.is_training = False
        #env = TorcsDockerEnv(self.docker_client, self.name, self.docker_port, training=True)
        env = TorcsEnv(vision=False, throttle=True, gear_change=False,
                       port=self.docker_port)

        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        local_episodes = 0
        print("Starting {}".format(self.name))

        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():

                # Update with global weights, the action A3C
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0

                # relaunch the environment every ninth episode to avoid the TORCS memory leak
                if np.mod(local_episodes, 9) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()
                state_t = obs_to_state(observation)

                #ob = observation
                #print(ob)

                #s_t = np.hstack((0, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
                #state_t = s_t


                done = False

                while not done:

                    # Get the action and apply it to the environment
                    action_t, value_t = sess.run(
                        [self.local_AC.action, self.local_AC.value],
                        feed_dict={self.local_AC.inputs: [state_t]})

                    observation, reward_t, done, _ = env.step(action_t[0][0])

                    if not done:
                        state_t1 = obs_to_state(observation)
                        episode_frames.append(state_t1)
                    else:
                        state_t1 = state_t

                    # Store the episode
                    episode_buffer.append(
                        [state_t, action_t, reward_t, state_t1, done,
                         value_t[0, 0]])
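                    # Each buffer entry is (s_t, a_t, r_t, s_t1, done, V(s_t));
                    # the stored value estimates are presumably used for advantage
                    # estimation when self.train processes the rollout (standard A3C).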
                    episode_values.append(value_t[0, 0])

                    episode_reward += reward_t

                    state_t = state_t1
                    total_steps += 1
                    episode_step_count += 1

                    if (total_steps % 30) == 0:
                        print(
                            self.name,
                            "Episode", episode_count, "Step",
                            episode_step_count, "Total_Steps",
                            total_steps, "Action", action_t[0][0],
                            "Reward", reward_t)
                        summary = tf.Summary()
                        summary.value.add(
                            tag='summary/reward_1',
                            simple_value=float(reward_t))
                        self.summary_writer.add_summary(
                            summary, total_steps)

                    self.summary_writer.flush()

                    # If the episode buffer is full, flush it and update
                    # the network weights
                    if (len(episode_buffer) == 15 and not done
                            and episode_step_count != max_episode_length-1):

                        value_t1 = sess.run(
                            self.local_AC.value,
                            feed_dict={self.local_AC.inputs: [state_t]})[0, 0]
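                        # value_t1 is the critic's estimate of the current state; it
                        # bootstraps the return of this truncated rollout (n-step A3C target).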

                        (value_loss, policy_loss, gradient_norm,
                            variable_norm) = self.train(
                                episode_buffer, sess, gamma, value_t1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
            
            
                    if (done or episode_step_count >= max_episode_length):
                        break

                local_episodes += 1
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(
                    np.mean(episode_values))

                if len(episode_buffer) != 0:
                    # Train the network on the remaining episode buffer
                    (value_loss, policy_loss, gradient_norm,
                     variable_norm) = self.train(
                        episode_buffer, sess, gamma, 0.0)

                if episode_count != 0:
                    if (self.name == 'worker_0'):
                        saver.save(
                            sess,
                            os.path.join(self.modeldir,
                                         'model-{:d}.cptk'.format(
                                             episode_count)))

                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])

                    print(
                        "Worker", self.name, "Episode", episode_count,
                        "Reward", mean_reward, "value_Loss", value_loss,
                        "policy_loss", policy_loss)

                    summary = tf.Summary()
                    summary.value.add(
                        tag='Perf/Reward',
                        simple_value=float(mean_reward))
                    summary.value.add(
                        tag='Perf/Length',
                        simple_value=float(mean_length))
                    summary.value.add(
                        tag='Perf/Value',
                        simple_value=float(mean_value))
                    summary.value.add(
                        tag='Losses/Value Loss',
                        simple_value=float(value_loss))
                    summary.value.add(
                        tag='Losses/Policy Loss',
                        simple_value=float(policy_loss))
                    summary.value.add(
                        tag='Losses/Grad Norm',
                        simple_value=float(gradient_norm))
                    summary.value.add(
                        tag='Losses/Var Norm',
                        simple_value=float(variable_norm))

                    self.summary_writer.add_summary(
                        summary, episode_count)

                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
        env.end()
Exemplo n.º 18
def main():

    # Creating necessary directories
    collect_track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir = "experiment-%s/" % experiment_name
    models_dir = experiment_dir + "model/"
    datas_dir = experiment_dir + "datas-track-no-%d/" % collect_track_no

    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return

    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    if not os.path.exists(datas_dir):
        os.mkdir(datas_dir)

    action_dim = 1
    state_dim = 30
    env_name = 'torcs'

    sess = tf.InteractiveSession()
    agent = ddpg(env_name, sess, state_dim, action_dim, models_dir)
    agent.load_network()

    vision = True
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   text_mode=False,
                   track_no=collect_track_no,
                   random_track=False,
                   track_range=(0, 3))

    print("Collecting Start.")
    max_data_entry_count = 2000
    data_entry_count = 0
    start_time = time.time()
    i = 0
    step = 0
    try:
        file = open(datas_dir + 'state-action-scalar', 'w')
        while data_entry_count < max_data_entry_count:
            if np.mod(i, 3) == 0:
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, 0.0))
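            # 30-d state: angle(1) + track(19, assuming the standard range-finder layout)
            # + trackPos(1) + speedX/Y/Z(3) + wheelSpinVel(4) + rpm(1) + previous steering(1),
            # matching state_dim = 30 above.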
            pre_a_t = 0.0
            while data_entry_count < max_data_entry_count:
                a_t = agent.action(s_t)

                ob, r_t, done, info = env.step([a_t[0], 0.16, 0])

                print("Step", step, "Action", a_t, "Reward", r_t)

                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, a_t[0]))

                image = ob.img
                if step > 20:
                    plt.imsave(
                        datas_dir + ("%d-%d.jpg" %
                                     (collect_track_no, data_entry_count)),
                        image)
                    ret = file.write(
                        "%f %f %f %f %f\n" %
                        (ob.speedX, ob.speedY, ob.speedZ, pre_a_t, a_t[0]))
                    if ret == 0:
                        print("File Write error")
                    data_entry_count += 1

                s_t = s_t1
                step += 1
                pre_a_t = a_t[0]

                if done:
                    break

            print(("TOTAL REWARD @ " + str(i) + "Collect", data_entry_count))
            print(("Total Step: " + str(step)))
            print("")

    except:
        traceback.print_exc()
        with open(datas_dir + "exception", 'w') as exc_file:
            exc_file.write(str(traceback.format_exc()))

    finally:

        file.close()

        env.end()
        end_time = time.time()

        with open(datas_dir + "log", 'w') as file:
            file.write("total_step = %d\n" % step)
            file.write("total_time = %s (s)\n" % str(end_time - start_time))

        print("Finish.")
Exemplo n.º 19
def main():
    """main method
    
    log runtime and print it at the end
    """
    s_time = timeit.default_timer()     
    global iteration
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    memory = ReplayBuffer()
    epsilon = 1
    train_indicator = True
    modelPATH = os.path.join('.',"models",'E0011.pt')

    q,q_target = QNet(state_dim,action_dim),QNet(state_dim,action_dim)
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(state_dim), MuNet(state_dim)
    mu_target.load_state_dict(mu.state_dict())
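    # Target networks start as exact copies of the online networks and are then
    # tracked slowly via soft_update (Polyak averaging) after each training step.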
    steer_noise = OUN(np.zeros(1),theta = 0.6)
    accel_noise = OUN(np.zeros(1),theta = 0.6)
    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)

    #tensorboard writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "ddpg_torch", current_time+'E0011t')
    writer = SummaryWriter(log_dir)
    samplestate = torch.rand(1,29)
    sampleaction = torch.rand(1,2)

    #writer.add_graph(mu,samplestate)
    writer.add_graph(q,(samplestate,sampleaction))
    writer.close()

    if not train_indicator:
        mu = torch.load(modelPATH)
        mu.eval()
        ob = env.reset()
        score = 0
        for n_step in range(100000):
            s_t = np.hstack((ob.angle, ob.track,ob.trackPos,ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            a_t = mu(torch.from_numpy(s_t.reshape(1,-1)).float()).detach().numpy()
            ob,r_t,done,_ = env.step(a_t[0])
            score += r_t
            if done:
                print("score:",score)
                break
        env.end()
        return 0

    for n_epi in range(max_episode):
        print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size()))
        if np.mod(n_epi, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()
        a_t = np.zeros([1,action_dim])
        s_t = np.hstack((ob.angle, ob.track,ob.trackPos,ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        score = 0
        q_value_writer(q, mu, s_t, writer, 'Episode Start Q value')
        q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value')
        #t_start = timeit.default_timer()
        for n_step in range(max_step):
            #epsilon -= 1.0/EXPLORE
            a_origin = mu(torch.from_numpy(s_t.reshape(1,-1)).float())
            if train_indicator:  # add exploration noise during training
                # sn = max(epsilon,0)*steer_noise()
                sn = steer_noise()
                # an = max(epsilon,0)*accel_noise()
                an = accel_noise()
                a_s = a_origin.detach().numpy()[0][0] + sn
                a_t[0][0] = np.clip(a_s, -1, 1)  # keep steering in its valid range
                a_a = a_origin.detach().numpy()[0][1] + an
                a_t[0][1] = np.clip(a_a, 0, 1)  # keep acceleration in its valid range
                #record noise movement
                if iteration%10==0:
                    writer.add_scalar('Steer noise', sn, iteration)
                    writer.add_scalar('Accel_noise', an, iteration)
            else:
                a_t = a_origin.detach().numpy()
            ob,r_t,done,_ = env.step(a_t[0])
            score += r_t

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            memory.put((s_t,a_t[0],r_t,s_t1,done))
            s_temp = copy.deepcopy(s_t) # for end q value log
            s_t = s_t1

            if train_indicator and memory.size()>train_start_size:
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer,writer)
                soft_update(mu, mu_target)
                soft_update(q,  q_target)
            
            iteration+=1

            if done:
                q_value_writer(q,mu,s_temp,writer,'Episode End Q value')
                q_value_writer(q_target,mu_target,s_temp,writer,'Episode End target Q value')
                break
        #t_end = timeit.default_timer()
        
        print("TOTAL REWARD @ " + str(n_epi) +"-th Episode  : Reward " + str(score))
        print("Total Step: " + str(n_step))
        print("")
        #print('{}steps, {} time spent'.format(i,t_end-t_start))
    
    torch.save(mu,modelPATH)
    
    env.end()
    
    e_time = timeit.default_timer()
    print("Total step {} and time spent {}".format(iteration, e_time-s_time))
Exemplo n.º 20
    def sample_path(self, num_episodes=None):
        """
    MODIFIED FOR TORCS!
    Sample path for the environment.
  
    Args:
            num_episodes:   the number of episodes to be sampled 
              if none, sample one batch (size indicated by config file)
    Returns:
        paths: a list of paths. Each path in paths is a dictionary with
            path["observation"] a numpy array of ordered observations in the path
            path["actions"] a numpy array of the corresponding actions in the path
            path["reward"] a numpy array of the corresponding rewards in the path
        total_rewards: the sum of all rewards encountered during this "path"

    """
        episode = 0
        episode_rewards = []
        episode_roll_distances = []
        paths = []
        t = 0
        i = 0
        print
        print("TORCS Experiment Start".center(80, '='))
        env = TorcsEnv(vision=self.config.vision,
                       throttle=self.config.throttle)
        #print('Num episodes', num_episodes)
        print('Using a batch size of: ', self.config.batch_size)
        try:
            while (num_episodes or t < self.config.batch_size):
                i += 1
                print('t', t, 'i', i)
                #Relaunch TORCS every 10 episodes to avoid its memory leak
                if np.mod(i, 10) == 0:
                    state = env.reset(relaunch=True)
                else:
                    state = env.reset()
                state = np.concatenate([
                    state.track,
                    np.array([state.speedX, state.speedY, state.speedZ])
                ],
                                       axis=0)
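                # State: the track range-finder readings (19 in the standard gym_torcs
                # setup) concatenated with the three speed components.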
                states, actions, rewards = [], [], []
                episode_reward = 0

                for step in range(self.config.max_ep_len):
                    states.append(state)
                    #print('State', state)
                    action = self.sess.run(self.sampled_action,
                                           feed_dict={
                                               self.observation_placeholder:
                                               np.reshape(
                                                   states[-1],
                                                   [1, self.observation_dim])
                                           })[0]
                    state, reward, done, info = env.step(action)
                    #print('\n State track', state.track)
                    #print('\n State focus', state.focus)
                    state = np.concatenate([
                        state.track,
                        np.array([state.speedX, state.speedY, state.speedZ])
                    ],
                                           axis=0)

                    #print('State', state)
                    #print('Reward', reward)
                    #print('info', info)
                    actions.append(action)
                    rewards.append(reward)
                    episode_reward += reward
                    t += 1
                    if (done or step == self.config.max_ep_len - 1):
                        episode_rewards.append(episode_reward)
                        episode_roll_distances.append(env.distance_travelled)
                        break
                    if (not num_episodes) and t == self.config.batch_size:
                        break

                path = {
                    "observation": np.array(states),
                    "reward": np.array(rewards),
                    "action": np.array(actions)
                }
                paths.append(path)
                episode += 1
                if num_episodes and episode >= num_episodes:
                    break
        finally:
            env.end()  # This is for shutting down TORCS
            print("Finished TORCS session".center(80, '='))
        return paths, episode_rewards, episode_roll_distances
Exemplo n.º 21
def playGame(train_indicator=1,
             safety_constrain_flag=False):  #1 means Train, 0 means simply Run
    #initialization = 0
    episode_trained = 0
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.9999
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 2  #Steering/Acceleration
    state_dim = 29 + 36  #29 base sensor inputs + 36 opponent sensors

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 1000
    max_steps = 300
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    indicator = 0

    plt.ion()

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load the weight")

    try:
        actor.model.load_weights("actormodel_following.h5")
        critic.model.load_weights("criticmodel_following.h5")
        actor.target_model.load_weights("actormodel_following.h5")
        critic.target_model.load_weights("criticmodel_following.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    results_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []
    damage_time = []

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        print("Epsilon is: ", epsilon)
        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
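        # 65-d state: angle + 19 track sensors + trackPos + speedX/Y/Z + 4 wheel speeds
        # + rpm + 36 opponent range sensors, matching state_dim = 29 + 36
        # (assuming the standard TORCS sensor layout).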
        epsilon = epsilon * 0.998
        total_reward = 0.
        damage_steps = 0
        for j in range(max_steps):
            loss = 0
            damage = 0
            #epsilon -= 1 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            # Action selection uses the target actor whether training or not
            a_t_original = actor.target_model.predict(
                s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0.1) * OU.function2(
                a_t_original[0][0], 0.5, 0.90, 0.2)
            #noise_t[0][1] = train_indicator * max(epsilon, 0.0) * OU.function(a_t_original[0][1],  1.0 , 1.00, 0.10)
            noise_t[0][1] = train_indicator * max(epsilon, 0.1) * OU.function1(
                a_t_original[0][1], 0.9, 1.0, 0.60)

            #The following code do the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)
            '''
            if np.random.randn() < max(epsilon,0.05):
                a_t[0][0] = np.random.randn()*2-1
            else:
                a_t[0][0] = a_t_original[0][0]
            '''

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            a_t_primitive = Get_actions(a_t[0][0],
                                        a_t[0][1],
                                        ob,
                                        safety_constrain=safety_constrain_flag)

            ob, r_t, done, info = env.step(a_t_primitive)

            if r_t == -5.0 or r_t == -1.0:
                damage_steps += 1
                damage = 1

            trackPos_list.append(ob.trackPos)
            speed_list.append(ob.speedX)
            epreward_list.append(r_t)
            damage_time.append(damage)

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        damage_rate = float(damage_steps) / max(j, 1) * 100

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_following.h5",
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_following.h5",
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        if train_indicator:
            # Save the results
            cumreward_list.append(total_reward)
            average_step_reward_list.append(total_reward / j)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(epsilon)
            sio.savemat(
                'results_overtaking.mat', {
                    'total_reward': cumreward_list,
                    'average_reward': average_step_reward_list,
                    'epsilon': epsilon_list,
                    'damage': damage_rate_list
                })
        else:
            sio.savemat(
                'info.mat', {
                    'ep_reward': epreward_list,
                    'trackPos': trackPos_list,
                    'speed': speed_list,
                    'damage_rate': damage_rate,
                    'damage_time': damage_time
                })
            print('damage rate is:', damage_rate)

        plt.figure(1)
        plt.hold(True)
        plt.subplot(511)
        plt.plot(i, total_reward, 'ro')
        plt.xlabel("Episodie")
        plt.ylabel("Episodic total reward")
        plt.subplot(512)
        plt.plot(i, total_reward / j, 'bo')
        plt.xlabel("Episodie")
        plt.ylabel("Expected reward each step")
        plt.subplot(513)
        plt.plot(i, damage_rate, 'go')
        plt.xlabel("Episodie")
        plt.ylabel("Damage rate per episode [%]")
        plt.subplot(514)
        plt.plot(i, max(epsilon, 0.1), 'yo')
        plt.xlabel("Episodie")
        plt.ylabel("epsilon")
        plt.subplot(515)
        plt.plot(i, loss / j, 'yo')
        plt.xlabel("Episodie")
        plt.ylabel("Average loss")
        plt.draw()
        plt.show()
        plt.pause(0.001)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')
    print("Finish.")
Exemplo n.º 22
def main():
    global play, replay_buffer, q_act_net, q_target_net, step
    #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    tstr = time.strftime('_%H_%M_%S_')

    first_run = False  #don't use saved nets/buffer
    reset_buffer = True  #don't use saved buffer

    #dqn params
    GAMMA = 0.99
    TAU = .0001  #.0001

    #exploration noise params
    act_noise_init = 0.5  #.75 #.25
    act_noise_final = .01  # .25
    act_noise_interval = 100000
    rnd_range = 1

    #lag augmentation
    packet_lost = 0  #0.01

    #action smoothing augmentation
    lambda_spatial_q = 0
    action_smoother = .33
    action_limiter = .33

    episode_count = 10000
    max_steps = 5000
    save_in_iters = 15000
    #100000

    #start training after accumulating train_start_num samples
    train_start_num = 1 * BATCH_SIZE

    caffe.set_mode_gpu()
    caffe.set_device(0)

    #balance track samples in replay buffer
    track_balance = .9

    #n-steps dqn, steps=max_n_batches*batch size
    max_n_batches = 16

    # use n-steps dqn for this ratio, rest 1-step
    n_step_ratio = .75

    #priority buffer
    k_priority_try = 2

    if n_step_ratio > 0 and max_n_batches > 0:
        n_steps_dqn = True
    else:
        n_steps_dqn = False

    #average speed
    c_speed = 35  # 50
    #speed variance
    delta_speed = 5  # 7.5
    #minimum interval (in steps) between target-speed switches
    switch_count_min = 50

    #target frame time
    frame_rate = .4
    #fail if lag is more then
    t_delta_fail = frame_rate * 1.75

    #for rebound handling
    rebound_count_max = 5

    start_run = 50
    # for error handling
    after_start_check = 100
    max_errors = 50

    #solver for current net
    if not play:
        critic_solver = caffe.get_solver(
            current_dir + 'resnet_torcs/dqn_critic_solver.prototxt')
    if first_run:
        # target net:
        q_target_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_batch_dqn.prototxt',
            current_dir + 'r18nb.caffemodel', caffe.TEST)
        # current net:
        q_act_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt',
            caffe.TEST)
        if not play:
            ParamCopy(critic_solver.net.params, q_target_net.params)
        ParamCopy(q_act_net.params, q_target_net.params)
    else:
        #target net:
        q_target_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_batch_dqn.prototxt',
            'qq_target.caffemodel', caffe.TEST)
        #current net:
        q_act_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt',
            'q_solver.caffemodel', caffe.TEST)
        if not play:
            ParamCopy(critic_solver.net.params, q_act_net.params)
        if not play and not reset_buffer:
            print 'loading replay buffer'
            replay_buffer = load_replay()
            replay_buffer.size_reduce(BUFFER_SIZE)
            print 'replay buffer loaded'

    print 'models loaded ***************************'

    if not play:
        assert q_target_net.blobs['state'].data.shape[0] == BATCH_SIZE
        assert q_act_net.blobs['state'].data.shape[0] == 1
        assert critic_solver.net.blobs['state'].data.shape[0] == BATCH_SIZE

        assert q_target_net.blobs['state'].data.shape[1] == CHANNELS
        assert q_act_net.blobs['state'].data.shape[1] == CHANNELS
        assert critic_solver.net.blobs['state'].data.shape[1] == CHANNELS

        assert q_target_net.blobs['q_action'].data.shape[1] == DISCR_A
        assert q_act_net.blobs['q_action'].data.shape[1] == DISCR_A
        assert critic_solver.net.blobs['q_action'].data.shape[1] == DISCR_A

    max_reached_step = 150  #used for track balance
    images_history = []  #used for input image
    step = 0  #total number of simulation steps
    save_count = 0  #used for saving nets/buffer
    n_batch = 0  #used for n-steps

    q_loss = 0  #main loss
    # Generate a Torcs environment
    env = TorcsEnv(vision=True, throttle=False, observer=False)
    time_start = time.time()
    track_id = 0  #track

    #n-step temp vars
    n_steps_cont_from_prev = False
    prev_start_pos = -1
    prev_track_id = -1
    Qlast = -1
    episod_steps = 0
    n_steps_used = 0
    batches_used = 0

    #for error failure
    rest_fail = 0

    rebound_events = 0

    for i in range(episode_count):
        #balance tracks
        if episod_steps >= max_reached_step * track_balance:
            track = t_list[track_id]
            change_track(
                "/usr/local/share/games/torcs/config/raceman/quickrace.xml",
                track)
            print "Track: ", track, "track_id", track_id
            episod_steps = 0

        print("Episode : " + str(i))
        ob = env.reset(relaunch=True)

        s_t = None  #input image
        total_reward = 0.

        #for randomizing velocity
        switch_count = switch_count_min + random.randint(0, switch_count_min)

        #for handling out-of-lane
        rebound = False
        rebound_count = 0
        track_pos = 0
        error_count = 0
        act_prev = np.array([0.])
        t_delta = 0

        for j in range(max_steps):
            max_reached_step = max(max_reached_step, j)
            a_t = np.array([0.])  #action
            skip_state = False
            error_present = False

            #exploration noise params
            act_noise = act_noise_init + (
                act_noise_final - act_noise_init) * min(
                    step * 1. / act_noise_interval, 1.)
            rnd_noise = 1
            if rnd_range > 1:
                rnd_noise = int(
                    (rnd_range + 1) *
                    max(1.,
                        float(act_noise_interval - step) / act_noise_interval))

            #get action =======================================================
            if s_t is None:
                action_index = random.randrange(DISCR_A)
                print '----------Random Action---------- action_index', action_index
                a_t[0] = ind2a(action_index, DISCR_A, DELTA_A)
            else:
                a_t[0] = qchoice(q_act_net, s_t, CHANNELS, DISCR_A, DELTA_A)
                #apply exploration noise
                if not play and random.random() <= act_noise:
                    ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                    r = 1
                    if rnd_noise > 1:
                        r = randint(1, rnd_noise)
                    ind += randint(-r, r)
                    ind = min(max(ind, 0), DISCR_A - 1)
                    a_t[0] = ind2a(ind, DISCR_A, DELTA_A)

            #if still no action use random
            if a_t is None:
                action_index = random.randrange(DISCR_A)
                print 'rnd action_index', action_index
                a_t[0] = ind2a(action_index, DISCR_A, DELTA_A)

            #starting area
            if j < start_run:
                a_t[0] = 0

            #action limiter
            if not play and abs(
                    a_t[0]) > DELTA_A / 2 and random.random() < action_limiter:
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                dind = ind - DISCR_A / 2
                if dind > (DISCR_A - 1) / 4:
                    dind = (DISCR_A - 1) / 4
                if dind < -(DISCR_A - 1) / 4:
                    dind = -(DISCR_A - 1) / 4
                a_t[0] = ind2a(dind + DISCR_A / 2, DISCR_A, DELTA_A)

            #save action
            a_0_list.append(a_t)

            #fail on render delay
            if not play and t_delta > t_delta_fail and i > rest_fail + 10 and j >= after_start_check:
                error_present = True
                if error_count >= max_errors / 2:
                    print 'delta fail **************************'
                    rest_fail = i
                    break
                else:
                    error_count += 1

            #randomize speed
            if j % switch_count == 0 and not play:
                tag_speed_rnd = c_speed - delta_speed + random.uniform(
                    0, delta_speed * 2)
            else:
                tag_speed_rnd = c_speed

            #render delay compensation
            if t_delta > frame_rate:
                tag_speed = frame_rate / t_delta * tag_speed_rnd
            else:
                tag_speed = tag_speed_rnd

            #handle out-of-lane event
            if rebound:
                rebound_count = rebound_count_max
            else:
                rebound_count = max(0, rebound_count - 1)
            if (rebound_count > rebound_count_max / 2
                    and abs(track_pos) > .7) or rebound:
                angle = -observation.angle
                if angle * track_pos > 0 and abs(angle) > .2:
                    a_t[0] = -sign(track_pos) * 4 * DELTA_A / 5
                if angle * track_pos > 0 and abs(angle) <= .2:
                    a_t[0] = -sign(track_pos) * 2 * DELTA_A / 5
                if angle * track_pos < 0 and abs(angle) <= .15:
                    a_t[0] = -sign(track_pos) * DELTA_A / 5
                if angle * track_pos < 0 and abs(angle) > .15:
                    a_t[0] = 0
                if angle * track_pos < 0 and abs(angle) >= .35:
                    a_t[0] = sign(track_pos) * DELTA_A / 5
                tag_speed = min(tag_speed, 20)
                print "############ rebound, action", a_t[
                    0], "V angle", angle, "###############"

            #smooth action
            if not play and action_smoother > 0 and random.random() < action_smoother:
                ind_prev = a2ind(act_prev[0], DISCR_A, DELTA_A)
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                if abs(ind - ind_prev) > 1:
                    print "smooth ind", ind, "->", np.rint(.5 *
                                                           (ind_prev + ind))
                    ind = int(.5 * (ind_prev + ind))
                    a_t[0] = ind2a(ind, DISCR_A, DELTA_A)

            a_act = a_t
            #lag augmentation
            if not play and random.random() < packet_lost and t_delta < frame_rate:
                a_act = act_prev

            #===================== main environment step =========================================
            obs0 = time.time()
            prev_rebound = rebound
            observation, r_t, done, rebound, _ = env.step(a_act, tag_speed)
            curr_time = time.time()
            t_delta = curr_time - time_start
            time_start = curr_time
            #====================================================================================
            if rebound and not prev_rebound:
                rebound_events += 1
            print 't_delta', t_delta, "step", j, "step time", curr_time - obs0, "tag_speed_rnd", tag_speed_rnd, "rebound_events", rebound_events
            if rebound:
                r_t = 0
            if prev_rebound and r_t == 0:
                skip_state = True

            #speed failure, could be moved to gym_torcs
            if observation.speedX < .01 and j >= after_start_check and t_delta < t_delta_fail:
                skip_state = True
                error_present = True
                r_t = 0
                if error_count >= max_errors:
                    print 'speed too slow fail, speed', 300 * observation.speedX, '**************************'
                    break
                else:
                    error_count += 1

            #make state ========================================================
            image = observation.img
            images_history.append(image)
            while len(images_history) > CHANNELS + 1:
                images_history.pop(0)
            s_t1 = make_state(images_history, CHANNELS)
            track_pos = observation.trackPos

            #save stat
            reward_list.append(r_t)
            track_list.append(track_pos)
            yspeed_list.append(observation.speedY)

            #store data into replay buffer ======================================
            do_store = not play and s_t is not None and s_t1 is not None and not skip_state
            if do_store:
                print 'add data, action', a_t[0], 'reward ', r_t
                w_p = j
                replay_buffer.add(s_t, a_t, r_t, s_t1, done, w_p, track_id, -1,
                                  -1)
                print '***** stored: track_pos', track_pos, 'angle', observation.angle,\
                    'max_step', max_reached_step, 'Episode', i
            elif not play:
                print 'skipped state track_pos', track_pos, 'angle', observation.angle,\
                    'max_step', max_reached_step, 'Episode', i

            #training  ======================================
            if not play and replay_buffer.num_experiences > train_start_num:
                #get batch using n-steps if previous batch was using n-step
                use_n_steps_now = n_steps_dqn
                if n_batch >= max_n_batches:
                    use_n_steps_now = False
                    n_batch = 0
                if n_steps_cont_from_prev and use_n_steps_now and max_n_batches > 1:
                    assert prev_start_pos >= 0
                    batch, n_steps_collected, prev_start_pos, prev_track_id =\
                        replay_buffer.getBatch4Pos(BATCH_SIZE, prev_start_pos, prev_track_id)
                    n_step_continued = n_steps_collected
                else:
                    n_step_continued = False
                if n_steps_used >= n_step_ratio * batches_used and not n_step_continued:
                    use_n_steps_now = False

                #get batch if previous batch was *not* using n-step
                if not n_step_continued:
                    batch, n_steps_collected, prev_start_pos, prev_track_id =\
                        replay_buffer.getBatch(BATCH_SIZE, max_n_batches, k_priority_try, n_steps=use_n_steps_now)

                #net training =============
                q_loss, Qlast = train_on_batch(batch, q_target_net,
                                               critic_solver, DISCR_A, DELTA_A,
                                               BATCH_SIZE, GAMMA,
                                               n_steps_collected,
                                               n_step_continued, Qlast,
                                               lambda_spatial_q)

                #update n-step vars
                if n_steps_collected:
                    n_batch += 1
                    n_steps_used += 1
                else:
                    n_batch = 0
                batches_used += 1
                n_steps_cont_from_prev = n_steps_collected and prev_start_pos >= 0

                # target update ==============
                SoftUpdate(q_target_net.params, critic_solver.net.params, TAU)
                ParamCopy(q_act_net.params, critic_solver.net.params)
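                # SoftUpdate is assumed to Polyak-average the freshly trained solver
                # weights into the target net (TAU = 1e-4), while ParamCopy syncs the
                # single-sample inference net with the trained weights.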
                save_count += 1

            #save loss
            if not play:
                q_loss_list.append(q_loss)

            #update local vars
            s_t = s_t1
            act_prev = a_t
            if done:
                s_t = None
            if not error_present:
                error_count = max(0, error_count - 1)
            total_reward += r_t
            episod_steps += 1
            step += 1
            if done:
                break

        #save nets and buffer
        if not play and save_count >= save_in_iters:
            print "start save", save_count, step
            save_count = 0
            save_nets(q_act_net, q_target_net, step, replay_buffer)
            save_state(a_0_list, a_1_list, q_loss_list, reward_list,
                       track_list, yspeed_list, tstr + str(step))
        track_id = (track_id + 1) % len(t_list)
        print("TOTAL REWARD @ " + str(i) + " -th Episode  :  " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
    print("Finishing torcs.")
    env.end()  # This is for shutting down TORCS
    #save nets and buffer
    if not play:
        save_state(a_0_list, a_1_list, q_loss_list, reward_list, track_list,
                   yspeed_list, tstr + str(step))
        save_nets(q_act_net, q_target_net, step, replay_buffer, "_finished")
    print 'Finish'
Exemplo n.º 23
def playGame(train_indicator=is_training):  # 1 means Train, 0 means simply Run

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input
    env_name = 'Torcs_Env'
    agent = DDPG(env_name, state_dim, action_dim)

    # Generate a Torcs environment
    vision = False
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False

    step = 0
    best_reward = -100000

    print("TORCS Experiment Start.")
    for i in range(episode_count):
        ##Occasional Testing
        if ((np.mod(i, 10) == 0) and (i > 20)):
            train_indicator = 0
        else:
            train_indicator = is_training

        # relaunch TORCS every 3 episode because of the memory leak error
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # Early-episode annealing for out-of-track driving and slow progress.
        # During the early training phase, off-track and slow driving is tolerated,
        # much like a human learner's margin of error; as training progresses the
        # constraints become stricter.

        random_number = random.random()
        eps_early = max(epsilon, 0.10)
        if (random_number < (1.0 - eps_early)) and (train_indicator == 1):
            early_stop = 1
        else:
            early_stop = 0
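        # early_stop is passed to env.step below; with probability (1 - epsilon) the
        # episode is terminated early for off-track or very slow driving, so the
        # tolerated margin of error shrinks as epsilon decays (assumed env behaviour).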
        print("Episode : " + str(i) + " Replay Buffer " + str(agent.replay_buffer.count()) + ' Early Stopping: ' + str(
            early_stop) + ' Epsilon: ' + str(eps_early) + ' RN: ' + str(random_number))

        # Initializing the first state
        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        # Counting the total reward and total steps in the current episode
        total_reward = 0.
        step_eps = 0.

        for j in range(max_steps):

            # Take noisy actions during training
            if (train_indicator):
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.1)
                a_t = agent.noise_action(s_t, epsilon)  # noisy action from the policy given state s_t
            else:
                a_t = agent.action(s_t)

            # ob, r_t, done, info = env.step(a_t[0],early_stop)
            ob, r_t, done, info = env.step(a_t, early_stop)  # apply the action and get the environment feedback
            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            # Add to the replay buffer only if training (is this check necessary? Probably not)
            if (train_indicator):
                agent.perceive(s_t, a_t, r_t, s_t1, done)

            # Checking for NaN rewards
            if (math.isnan(r_t)):
                r_t = 0.0
                for bad_r in range(50):
                    print('Bad Reward Found')

            total_reward += r_t
            s_t = s_t1

            # Displaying progress every 15 steps.
            if ((np.mod(step, 15) == 0)):
                print("Episode", i, "Step", step_eps, "Epsilon", epsilon, "Action", a_t, "Reward", r_t)

            step += 1
            step_eps += 1
            if done:
                break

        # Saving the best model.
        if total_reward >= best_reward:
            if (train_indicator == 1):
                print("Now we save model with reward " + str(total_reward) + " previous best reward was " + str(
                    best_reward))
                best_reward = total_reward
                agent.saveNetwork()

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 24
def playGame(checkpoints=None,
             train_indicator=1,
             eps=1.0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.01  #Learning rate for Actor
    LRC = 0.05  #Learning rate for Critic

    vision = True
    action_dim = 3  #Steering/Acceleration/Brake

    if vision:
        state_dim = (64, 64, 3)  #of sensors input
    else:
        state_dim = 29
    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    reward = 0
    done = False
    step = 0
    epsilon = eps
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)
    summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA,
                         vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC,
                           vision)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    log_file = open('train_log.log', 'w')
    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.model.load_weights("criticmodel_{}.h5".foramt(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights(
            "criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        if vision:
            history.fill((ob.img))
            s_t = history.get()
        else:
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        total_damage = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            if vision:
                a_t_original = actor.model.predict(
                    s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(s_t.reshape(
                    1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.30, 0.30)
            noise_t[0][1] = 0.1 + train_indicator * max(
                epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            damage = ob.damage

            if vision:
                last_s_t = history.get().copy()
                history.add((ob.img))
                next_s_t = history.get().copy()
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t,
                             done)  #Add replay buffer
                s_t1 = history.get()
            else:
                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape(
                        (-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states,
                     actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)

                actor.target_train()
                critic.target_train()

            total_reward += r_t
            total_damage += damage
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i),
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_{}.h5".format(i),
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)
        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward) + "  EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) +
              ' Min: ' + str(min_reward))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def signal_handler(signal, frame):
    print('You pressed Ctrl+C!')
    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    env.end()
    sys.exit(0)
Exemplo n.º 26
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        print(ob.track)

        total_reward = 0.
        stucked = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 27
print('Collecting data...')
for i in range(steps):
    if i == 0:
        act = np.array([0.0])
    else:
        act = get_teacher_action(ob)

    if i%100 == 0:
        print(i)
    ob, reward, done, _ = env.step(act)
    img_list.append(ob.img)
    action_list.append(act)
    reward_list.append(np.array([reward]))

env.end()

print('Packing data into arrays...')
for img, act, rew in zip(img_list, action_list, reward_list):
    images_all = np.concatenate([images_all, img_reshape(img)], axis=0)
    actions_all = np.concatenate([actions_all, np.reshape(act, [1,action_dim])], axis=0)
    rewards_all = np.concatenate([rewards_all, rew], axis=0)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam

#model from https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py
model = Sequential()
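# The snippet stops right after creating the Sequential model. A minimal sketch of how it
# might continue, in the Keras 1 style of the cifar10_cnn example referenced above, regressing
# a camera image onto the teacher action; the (64, 64, 3) input shape, the layer sizes, and the
# tanh output over action_dim are illustrative assumptions, not taken from the original code.
model.add(Convolution2D(32, 3, 3, border_mode='same', input_shape=(64, 64, 3)))
model.add(Activation('relu'))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(action_dim))
model.add(Activation('tanh'))  # assumes actions are scaled to [-1, 1]
model.compile(loss='mse', optimizer=Adam(lr=1e-4))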
Exemplo n.º 28
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.0001    #Learning rate for Actor
    LRC = 0.001     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1,action_dim])
            noise_t = np.zeros([1,action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
        
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 29
def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000  # replay buffer capacity
    BATCH_SIZE = 32  # batch size: number of samples processed per update
    GAMMA = 0.99  # discount factor
    TAU = 0.001     # Target Network HyperParameters: soft-update coefficient
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # 29 sensor inputs

    np.random.seed(1337)  # fixed random seed so every run produces the same random sequence

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU memory policy: allocate GPU memory on demand (allow_growth)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # alternatively, hard-cap GPU memory usage at 40% of the device:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    #  Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # formatted string used to name the CSV output folder
    folder_path = "practise_progress/" + theTime + "/"  # Linux-style path only
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.

        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake", "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)

        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      # Add replay buffer
            
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300, ob.speedZ * 300,
                       a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t, loss]
            """        参数记录
                       轮次  步骤计数  车辆位置  X轴速度  Y轴速度  Z轴速度
                       加速输出  转向输出  刹车输出  回报  损失函"""
            writer.writerow(csvData)
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)



        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  #  This is for shutting down TORCS
    print("Finish.")
def preTrain():  # pre-train the actor and critic networks using an existing rule-based driver
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.0001    #Learning rate for Actor
    LRC = 0.001     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)
    # Generate a driver
    driver = DriverExample()

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("pre_actormodel.h5")
        critic.model.load_weights("pre_criticmodel.h5")
        actor.target_model.load_weights("pre_actormodel.h5")
        critic.target_model.load_weights("pre_criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        for j in range(max_steps):
            loss_actor = 0
            loss_critic = 0
            a_t = np.zeros([1,action_dim])
            
            # the driver produce the actions
            a_t = driver.action(s_t.reshape(state_dim, ))

            ob, r_t, done, info = env.step(a_t)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t, r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
            """
            if (train_indicator == 1):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            """
            loss_actor += actor.model.train_on_batch(states, actions) # train actor
            loss_critic += critic.model.train_on_batch([states,actions], y_t) # train critic
            actor.target_train()
            critic.target_train()

            total_reward += r_t
            s_t = s_t1
        
            print("Episode", i, "Step", step, ": ")
            print("Action", a_t, "Reward", r_t)
            print("loss_actor", loss_actor, "loss_critic", loss_critic)
        
            step += 1

            if np.mod(step, 100) == 0:
                print("Now we save model")
                actor.model.save_weights("pre_actormodel.h5", overwrite=True)
                with open("pre_actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("pre_criticmodel.h5", overwrite=True)
                with open("pre_criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
            
            if done:
                break

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 31
for episode in range(4000):
    print('Episode: ', episode)
    if episode % 1 == 0:  # always true, so TORCS is relaunched every episode
        ob = env.reset(relaunch=True)  # relaunch TORCS (avoids the TORCS memory leak bug)
    else:
        ob = env.reset()
    for move in range(10000):
        if TARGET_MODEL:
            action = act(target_actor_model, observation_formatter(ob))
        else:
            action = act(actor_model, observation_formatter(ob))
        action = action.flatten()
        new_ob, reward, done, _ = env.step(action)
        reward = reward/400
        print('\nq-value: ', target_critic_model.predict(observation_formatter(ob, action)))
        print('reward: ', reward, '\n')
        if np.isnan(reward):
            break
        buffer.loc[len(buffer), :] = [ob, action, reward, new_ob, done]
        update_actor_critic_model(sess, [actor_model, critic_model, target_actor_model, target_critic_model], buffer,
                                  [action_gradient_holder, update_op, gradient_op], ITERATIONS, BATCH_SIZE)
        ob = new_ob
        EPSILON = max(EPSILON*EPSILON_DECAY, MINIMUM_EPSILON)
        #print('\nepsilon: ', EPSILON, '\n')
        if done:
            break
        
# shut down torcs
env.end()
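
# The `buffer` filled above via buffer.loc[len(buffer), :] = [ob, action, reward, new_ob, done]
# behaves like a pandas DataFrame used as a replay buffer. A minimal sketch of how such a
# DataFrame might be created (in the original script it would sit before the episode loop);
# the column names here are assumptions inferred from the row layout, not the original code.
import pandas as pd

buffer_sketch = pd.DataFrame(columns=['observation', 'action', 'reward', 'new_observation', 'done'])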
Exemplo n.º 32
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005    #Learning rate for Actor
    LRC = 0.0005     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    # loading networks
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t, "Reward", r_t, "Loss", loss) #, "curLapTime", ob.curLapTime)
        
            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1,
             safety_constrain_flag=True):  #1 means Train, 0 means simply Run
    plt.ion()
    args = parser.parse_args()

    np.random.seed(1337)

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Define two intra-policies
    overtaking_policy = ActorNetwork(sess, args.state_size, args.action_size)
    following_policy = ActorNetwork(sess, args.state_size, args.action_size)
    try:
        overtaking_policy.model.load_weights("actormodel_overtaking.h5")
        overtaking_policy.target_model.load_weights("actormodel_overtaking.h5")
        following_policy.model.load_weights("actormodel_following.h5")
        following_policy.target_model.load_weights("actormodel_following.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    # with fixed following policy
    #option_policies = [overtaking_policy,overtaking_policy,overtaking_policy,following_policy(0.5),following_policy(0.5),following_policy(0.5)]
    # with learned following policy
    option_policies = [
        overtaking_policy, overtaking_policy, overtaking_policy,
        following_policy, following_policy, following_policy
    ]

    termination_steps = [10, 20, 30, 10, 20, 30]

    # Define option-value function Q_Omega(s,omega): estimate values upon arrival
    critic = OptionValueCritic(args.state_size, args.option_size,
                               args.discount, args.learning_rate_critic,
                               args.epsilon, args.epsilon_min,
                               args.epsilon_decay, args.tau)

    try:
        critic.load("option_value_model.h5")
        print("Critic Weight load successfully")
    except:
        print("Cannot find the critic weight")

    history = np.zeros((args.nepisodes, 2))

    # Define a buffer space to store samples
    buff = ReplayBuffer(args.buffer_size)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=args.vision, throttle=True, gear_change=False)

    print("TORCS Experiment Start.")

    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    results_list = []
    option_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []

    for episode in range(args.nepisodes):
        # Define variables to store values
        cumreward = 0.
        duration = 1
        option_switches = 0
        avgduration = 0.
        reward_option = 0
        total_options = 0
        damage_times = 0
        danger_time = 0
        collision_time = 0
        primitive_action_step = 0

        if np.mod(episode, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        state = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        state = state.reshape(1, state.shape[0])

        for step in range(args.nsteps):
            total_options += 1
            option = critic.get_option(state, train_indicator)
            reward_option = 0
            for i in range(termination_steps[option]):
                primitive_action_step += 1
                action = option_policies[option].target_model.predict(state)
                '''
                if option == 0 or option == 1 or option == 2:
                    action = option_policies[option].target_model.predict(state)
                else:
                    action = option_policies[option].act(ob)
                '''
                print(action)
                action = Low_level_controller(action[0][0], action[0][1], ob,
                                              safety_constrain_flag)

                print("Option: {} Action:{}".format(option, action))
                ob, r_t_primitive, done, _ = env.step(action)
                if r_t_primitive == -30.0:
                    collision_time += 1
                elif r_t_primitive == -5.0:
                    danger_time += 1
                damage_times = collision_time + danger_time

                option_list.append(option)
                trackPos_list.append(ob.trackPos)
                speed_list.append(ob.speedX)
                epreward_list.append(r_t_primitive)

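                # accumulate the discounted intra-option return:
                # R_option = sum_i gamma**i * r_i over the primitive steps taken under this option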
                reward_option = reward_option + args.discount**(
                    i) * r_t_primitive
                state_ = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
                state_ = state_.reshape(1, state_.shape[0])
                state = state_
                if done:
                    break

            buff.add(state, option, reward_option, state_, done)

            cumreward += reward_option

            reward_ep_per_step = cumreward / primitive_action_step
            damage_rate = damage_times / primitive_action_step
            if done:
                break
        if train_indicator:
            batch = buff.getBatch(args.batch_size)
            critic.replay(batch)
            if episode % 10 == 0:
                critic.save("option_value_model.h5")
        if train_indicator:
            # Save the results
            cumreward_list.append(cumreward)
            average_step_reward_list.append(reward_ep_per_step)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(critic.epsilon)
            results_list = [
                cumreward_list, average_step_reward_list, damage_rate_list,
                epsilon_list
            ]
            sio.savemat(
                'results_both_learned.mat', {
                    'total_reward': cumreward_list,
                    'average_reward': average_step_reward_list,
                    'epsilon': epsilon_list,
                    'damage_rate': damage_rate_list
                })
        else:
            sio.savemat(
                'test1lf1r.mat', {
                    'ep_reward': epreward_list,
                    'option': option_list,
                    'trackPos': trackPos_list,
                    'speed': speed_list
                })
            print('damage rate is:', damage_rate)

        history[episode, 0] = step
        history[episode, 1] = avgduration

        plt.figure(1)
        plt.hold(True)
        plt.subplot(311)
        plt.plot(episode, cumreward, 'ro')
        plt.xlabel('episode')
        plt.ylabel('Total reward per epsiode')
        plt.subplot(312)
        plt.hold(True)
        plt.plot(episode, cumreward / total_options, 'bo')
        plt.xlabel('episode')
        plt.ylabel('Average reward per option')
        plt.subplot(313)
        plt.hold(True)
        plt.plot(episode, critic.epsilon, 'go')
        plt.xlabel('episode')
        plt.ylabel('epsilon')

        plt.draw()
        plt.show()
        plt.pause(0.001)

    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')

    print("Finish.")