def sample_one(self):
        """
        MODIFIED SAMPLING FOR TORCS!
        """
        print()
        print('START PLOTTING MODULE'.center(80, '='))
        roll_distance = []
        print()
        print("TORCS Experiment Start".center(80, '='))
        env = TorcsEnv(vision=self.config.vision,
                       throttle=self.config.throttle)
        try:
            ob = env.reset()
            sonar, grayscale = self.image_to_sonar(ob.img)
            sonar = np.reshape(sonar, [19])
            state = np.concatenate(
                [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])], axis=0)
            obs, states, actions, rewards, sonars, grayscales = [], [], [], [], [], []

            done = False  #has the episode ended?
            start_time = time.time()
            while not done and (time.time() - start_time < 300):
                states.append(state)
                obs.append(ob)
                sonars.append(sonar)
                grayscales.append(grayscale)
                state = np.concatenate(
                    [sonar, np.array([ob.speedX, ob.speedY, ob.speedZ])],
                    axis=0)

                action = self.sess.run(
                    self.sampled_action,
                    feed_dict={
                        self.observation_placeholder:
                        np.reshape(state, [1, self.observation_dim])
                    })[0]
                ob, reward, done, info = env.step(action)
                sonar, grayscale = self.image_to_sonar(ob.img)
                sonar = np.reshape(sonar, [19])

                #print('Action: ', action)
                actions.append(action)
                rewards.append(reward)
                roll_distance.append(env.distance_travelled)
                #print('Roll distance: ', roll_distance)
        except:
            raise

        finally:
            env.end()  # This is for shutting down TORCS
            print("Finished TORCS session".center(80, '='))
            if roll_distance:
                print('Final distance: ', roll_distance[-1], ' [m]')
            print('END PLOTTING MODULE'.center(80, '='))
            #Plot some of the frames:
            self.grayscales = grayscales
            self.sonars = sonars
            self.obs = obs
            self.actions = actions
            self.roll_distance = roll_distance
            return
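The method above relies on self.image_to_sonar, which is not shown in this example. A minimal sketch of one way such a helper could look, assuming ob.img arrives as an (H, W, 3) array and that the 19-element "sonar" is a range-finder-like reading per sampled image column (the real conversion may differ):

import numpy as np

def image_to_sonar(img, n_beams=19, threshold=50):
    # Hypothetical helper with the same interface as self.image_to_sonar:
    # collapse an (H, W, 3) camera frame into a 19-beam, range-like vector
    # plus its grayscale copy. The thresholding scheme is an assumption.
    img = np.asarray(img, dtype=np.float32)
    grayscale = img.mean(axis=-1)
    h, w = grayscale.shape
    sonar = np.ones(n_beams, dtype=np.float32)
    for i, col in enumerate(np.linspace(0, w - 1, n_beams).astype(int)):
        column = grayscale[::-1, col]              # scan bottom-to-top
        dark = np.flatnonzero(column < threshold)  # first "obstacle" pixel
        if dark.size:
            sonar[i] = dark[0] / float(h)          # normalised distance
    return sonar, grayscale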
Example No. 2
def main():
    ppo = PPOAgent()
    env = TorcsEnv(text=True, vision=False, throttle=True, gear_change=False)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, './save_model13/models/model_59557.ckpt')

        for i in range(1000):
            obs = env.reset()
            state = convert_obs(obs)
            score, step = 0, 0

            while True:
                action = ppo.choose_action(state)
                next_obs, reward, done, _ = env.step(action)
                time.sleep(0.05)

                next_state = convert_obs(next_obs)

                score += reward
                step += 1

                if done:
                    print(step, score)
                    obs = env.reset()
                    state = convert_obs(obs)
                    step, score = 0, 0
                else:
                    state = next_state
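convert_obs is defined elsewhere in this repository; a plausible stand-in, assuming it stacks the same low-dimensional TORCS sensors the other examples use:

import numpy as np

def convert_obs(ob):
    # Assumed layout: angle, 19 track sensors, trackPos, speeds,
    # wheel spin velocities and rpm, as in the other examples here.
    return np.hstack((ob.angle, ob.track, ob.trackPos,
                      ob.speedX, ob.speedY, ob.speedZ,
                      ob.wheelSpinVel / 100.0, ob.rpm)).astype(np.float32)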
Example No. 3
def test_sac(sess, args, sac, saver):

    saver.restore(sess, 'ckpt/model')

    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    ob = env.reset(
        relaunch=True
    )  #relaunch TORCS every N episode because of the memory leak error

    s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                   ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

    done = False
    ep_rew = 0.0
    ep_len = 0

    while (not done):

        # deterministic actions at test time
        a = sac.get_action(s, True)

        ob, r, done, _ = env.step(a)
        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        ep_rew += r
        ep_len += 1

        if (ep_len >= args.max_ep_len):
            done = True

    print('test time performance: | rewards: ', ep_rew, ' | length: ', ep_len)
Example No. 4
def train_sac(sess, args, sac, saver):

    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    replay_buffer = ReplayBuffer(args.s_dim, args.a_dim, args.buff_size)

    for ep in range(args.total_ep):

        if np.mod(ep, 100) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every N episode because of the memory leak error
        else:
            ob = env.reset()

        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                       ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        done = False
        ep_rew = 0.0
        ep_len = 0

        while (not done):

            # first 10 episodes, just step on gas, drive straight
            if (ep > 10):
                a = sac.get_action(s)
            else:
                a = np.array([0.0, 1.0, 0.0])

            ob, r, done, _ = env.step(a)
            s2 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            ep_rew += r
            ep_len += 1

            if (ep_len >= args.max_ep_len):
                done = True

            replay_buffer.store(s, a, r, s2, float(done))

            s = s2

            batch = replay_buffer.sample_batch(args.batch_size)
            outs = sac.train(batch)

        print('episode: ', ep, ' | episode rewards: ', round(ep_rew, 4),
              ' | episode length: ', ep_len, ' | alpha/temperature: ', outs[9])
        with open("performance.txt", "a") as myfile:
            myfile.write(
                str(ep) + " " + str(ep_len) + " " + str(round(ep_rew, 4)) +
                " " + str(round(outs[9], 4)) + "\n")

        if (ep % 10 == 0):
            # save model
            saver.save(sess, 'ckpt/model')
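train_sac only relies on the buffer exposing store(s, a, r, s2, done) and sample_batch(batch_size); a minimal sketch of that interface, with the returned batch format being an assumption:

import numpy as np

class ReplayBuffer:
    def __init__(self, s_dim, a_dim, size):
        # Pre-allocated circular storage for transitions.
        self.s = np.zeros((size, s_dim), dtype=np.float32)
        self.a = np.zeros((size, a_dim), dtype=np.float32)
        self.r = np.zeros(size, dtype=np.float32)
        self.s2 = np.zeros((size, s_dim), dtype=np.float32)
        self.d = np.zeros(size, dtype=np.float32)
        self.ptr, self.n, self.size = 0, 0, size

    def store(self, s, a, r, s2, d):
        self.s[self.ptr], self.a[self.ptr] = s, a
        self.r[self.ptr], self.s2[self.ptr], self.d[self.ptr] = r, s2, d
        self.ptr = (self.ptr + 1) % self.size
        self.n = min(self.n + 1, self.size)

    def sample_batch(self, batch_size):
        idx = np.random.randint(0, self.n, size=batch_size)
        return dict(s=self.s[idx], a=self.a[idx], r=self.r[idx],
                    s2=self.s2[idx], d=self.d[idx])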
Example No. 5
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    generator = Generator(sess, feat_dim, aux_dim, encode_dim, action_dim)
    base_model = ResNet50(weights='imagenet', include_top=False)
    feat_extractor = Model(
        input=base_model.input,
        output=base_model.get_layer('activation_40').output
    )

    try:
        generator.model.load_weights(param_path)
        print("Weight load successfully")
    except:
        print("cannot find weight")

    env = TorcsEnv(throttle=True, gear_change=False)

    print("Start driving ...")
    ob = env.reset(relaunch=True)
    feat, aux = get_state(ob, aux_dim, feat_extractor)

    encode = np.zeros((1, encode_dim), dtype=np.float32)
    encode[0, code] = 1
    print "Encode:", encode[0]

    pre_actions = np.load(pre_actions_path)["actions"]

    for i in range(MAX_STEP_LIMIT):
        if i < MIN_STEP_LIMIT:
            action = np.zeros(3, dtype=np.float32)
        elif i < MIN_STEP_LIMIT + PRE_STEP:
            action = pre_actions[i - MIN_STEP_LIMIT]
        else:
            action = generator.model.predict([feat, aux, encode])[0]

        ob, reward, done, _ = env.step(action)
        feat, aux = get_state(ob, aux_dim, feat_extractor)

        if i == MIN_STEP_LIMIT + PRE_STEP:
            print "Start deciding ..."

        print "Step:", i, "DistFromStart:", ob.distFromStart, \
                "TrackPos:", ob.trackPos, "Damage:", ob.damage.item(), \
                "Action: %.6f %.6f %.6f" % (action[0], action[1], action[2]), \
                "Speed:", ob.speedX * 200

        if done:
            break

    env.end()
    print("Finish.")
Example No. 6
def main():
    # Creating necessary directories
    track_no = 5
    experiment_name = "tensorboard-4"
    experiment_dir  = "experiment-%s/" % experiment_name
    datas_dir = experiment_dir + "datas-track-no-%d/" % track_no
    models_dir = datas_dir + "model/"

    if not os.path.exists(experiment_dir):
        print("%s doesn't exist" % experiment_dir)
        return

    if not os.path.exists(datas_dir):
        print("%s doesn't exist" % datas_dir)
        return

    if not os.path.exists(models_dir):
        print("%s doesn't exist" % models_dir)
        return

    state_dim = 4
    img_dim = [304, 412, 3]
    sess = tf.InteractiveSession()
    agent = Supervise(sess, state_dim, img_dim, models_dir)
    agent.load_network()

    MAX_STEP = 10000
    step = 0
    vision = True
    env = TorcsEnv(vision=vision, throttle=True, text_mode=False, track_no=track_no, random_track=False, track_range=(5, 8))
    for i in range(1):
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, 0.0))
        i_t = ob.img
        # print(i_t)

        while step < MAX_STEP:
            action = agent.action(s_t, i_t)
            ob, reward, done, info = env.step([action, 0.16, 0])
            s_t = np.hstack((ob.speedX, ob.speedY, ob.speedZ, action))
            i_t = ob.img

            print("Step", step, "Action", action, "Reward", reward)
            if done:
                break

    env.end()
Example No. 7
def main():
    count = 0

    env = TorcsEnv(vision=False, throttle=True, gear_change=False, text=False)
    # action_fd = open('./trajectory/action15.csv', 'w', newline='')
    # observation_fd = open('./trajectory/observation15.csv', 'w', newline='')

    # action_writer = csv.writer(action_fd, delimiter=',')
    # observation_writer = csv.writer(observation_fd, delimiter=',')

    for ep in range(1, 5):
        done = False
        step, score = 0, 0
        if np.mod(ep, 5) == 0:
            ob = env.reset(relaunch=True)
            state = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedY, ob.speedX,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            #observation_writer.writerow(state.tolist())

        else:
            ob = env.reset()
            state = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedY, ob.speedX,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            #observation_writer.writerow(state.tolist())

        while not done:
            keyboard.hook(get_action)
            time.sleep(0.05)
            # write action value in action.csv

            # observation_writer.writerow(state.tolist())
            # action_writer.writerow([action[0], action[1], action[2]])

            next_ob, reward, done, info = env.step(action)

            next_state = np.hstack(
                (next_ob.angle, next_ob.track, next_ob.trackPos,
                 next_ob.speedY, next_ob.speedX, next_ob.speedZ,
                 next_ob.wheelSpinVel / 100.0, next_ob.rpm))
            #observation_writer.writerow(state.tolist())
            state = next_state
            # print(step, score)
            print(next_ob.lastLapTime)
            score += reward
            step += 1

    # action_fd.close()
    # observation_fd.close()
    sys.exit()
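The loop above relies on keyboard.hook(get_action) updating a global action between env.step calls; neither get_action nor action is shown. A hypothetical handler using the keyboard package's event API (the key bindings and magnitudes are assumptions):

import keyboard  # pip install keyboard

action = [0.0, 0.0, 0.0]  # steering, throttle, brake

def get_action(event):
    # Called by keyboard.hook for every key event; map arrow keys to controls.
    pressed = event.event_type == keyboard.KEY_DOWN
    if event.name == 'left':
        action[0] = 0.3 if pressed else 0.0
    elif event.name == 'right':
        action[0] = -0.3 if pressed else 0.0
    elif event.name == 'up':
        action[1] = 1.0 if pressed else 0.0
    elif event.name == 'down':
        action[2] = 0.8 if pressed else 0.0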
Example No. 8
def test():
    env = TorcsEnv(vision=True, throttle=False)
    ob = env.reset(relaunch=True)
    reward_sum = 0.0
    done = False

    count = 0
    while not done:
        act = model.predict(img_reshape(ob.img).astype('float32') / 255)
        #print(act)
        count += 1
        ob, reward, done, _ = env.step(act)
        reward_sum += reward
    env.end()
    print("Steps before crash: ", count, reward_sum)
    return count, reward_sum
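img_reshape is not shown; a plausible version for gym_torcs with vision=True, assuming 64x64 RGB frames and a Keras model that expects a single NHWC image:

import numpy as np

def img_reshape(img):
    # Turn the raw observation image into a (1, 64, 64, 3) batch of one frame.
    return np.asarray(img).reshape(1, 64, 64, 3)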
Example No. 9
def programmatic_game(tree_program, track_name='practgt2.xml'):
    episode_count = 2
    max_steps = 100000
    window = 5

    # Generate a Torcs environment
    env = TorcsEnv(vision=False,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    logging.info("TORCS Experiment Start with Priors on " + track_name)
    for i_episode in range(episode_count):
        ob = env.reset(
            relaunch=True
        )  # relaunch TORCS every episode because of the memory leak error
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        newobs = [item for sublist in tempObs[:-1] for item in sublist]

        for j in range(max_steps):
            act_tree = tree_program.predict([newobs])
            action_prior = [act_tree[0][0], act_tree[0][1], act_tree[0][2]]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            newobs = [item for sublist in tempObs[:-1] for item in sublist]

            ob, r_t, done, info = env.step(action_prior)
            if np.mod(j, 1000) == 0:
                logging.info("Episode " + str(i_episode) + " Distance " +
                             str(ob.distRaced) + " Lap Times " +
                             str(ob.lastLapTime))

            if done:
                print('Done. Steps: ', j)
                break

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
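programmatic_game only needs tree_program to expose predict([obs]) returning [[steer, accel, brake]] for the 29-value observation built above. The real tree program comes from elsewhere; purely for illustration, any multi-output scikit-learn regressor satisfies that contract:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Dummy training data: 29 observation features -> 3 control outputs.
X = np.random.rand(100, 29)
y = np.random.rand(100, 3)
tree_program = DecisionTreeRegressor(max_depth=4).fit(X, y)

programmatic_game(tree_program)  # uses the default track 'practgt2.xml'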
Example No. 10
def testDDPG(sess, args, actor, critic):

    # Generate a Torcs environment
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)    


    episode_count = args['episode_count']
    max_steps = args['max_steps']


    for i in range(episode_count):

        if np.mod(i, 100) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every N episode because of the memory leak error
        else:
            ob = env.reset()


        s = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

        ep_reward = 0
        ep_ave_max_q = 0

        
        for j in range(max_steps):

            
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) 
            # NOISE AT TEST TIME MAY BE REQUIRED TO STABILIZE ACTIONS
            a[0,:] += OU(x=a[0,:], mu=mu, sigma=sigma, theta=theta)

            ob, r, terminal, info = env.step(a[0])
            s2 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))


            s = s2
            ep_reward += r

            if terminal:

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                        i, (ep_ave_max_q / float(j))))
                break
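The OU call above adds exploration noise even at test time; mu, sigma and theta are globals not shown here. One Euler step of an Ornstein-Uhlenbeck process matching that keyword signature would be (the repository's exact implementation may differ):

import numpy as np

def OU(x, mu, sigma, theta):
    # Drift toward mu plus Gaussian diffusion, evaluated element-wise.
    return theta * (mu - x) + sigma * np.random.randn(*np.shape(x))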
Example No. 11
def play_trajectory(filepath, counts):

    # Defining torcs environment
    env = TorcsEnv(vision=VISION, throttle=True, gear_change=False)


    states, actions = load_expert_trajectory(filepath)
    
    for i in range(counts):
        print("Playing count : {}".format(i))
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        state = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

        for action in actions:
        
            # take action and observe reward and next state     
            ob, reward, done, info = env.step(action)

            if done:
                break
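load_expert_trajectory is defined elsewhere; a minimal sketch, assuming the file is an .npz archive holding 'states' and 'actions' arrays (the actual on-disk format is a guess):

import numpy as np

def load_expert_trajectory(filepath):
    # Load recorded expert states and actions from a single archive.
    data = np.load(filepath)
    return data['states'], data['actions']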
Example No. 12
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        print(ob.track)

        total_reward = 0.
        stucked = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 13
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")

    attacks = []
    for i in range(-10, 0):
        val = i / 10.0
        attacks.append([77, val])
    # for i in range(45, 55):
    #     attacks.append([i, -1.5])
    #     attacks.append([i, 1.5])
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        # else:
        #     ob = env.reset()
        ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # if j == 71:
            #     print("cp attack!")
            #     if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            if (j == attacks[i][0]):
                print('cp attack on {} with {}'.format(attacks[i][0],
                                                       attacks[i][1]))
                a_t[0][0] = attacks[i][1]
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            cur_step_sample = [
                s_t.tolist(), a_t[0].tolist(), r_t,
                s_t1.tolist(), done
            ]
            cur_sample.append(cur_step_sample)

            # #Do the batch update
            # batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])

            # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA*target_q_values[k]

            # if (train_indicator):
            #     loss += critic.model.train_on_batch([states,actions], y_t)
            #     a_for_grad = actor.model.predict(states)
            #     grads = critic.gradients(states, a_for_grad)
            #     actor.train(states, grads)
            #     actor.target_train()
            #     critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attacks[i][0],
                                          attacks[i][1])
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        # overall_scores.append(total_reward)
        # plt.clf()
        # plt.plot(overall_scores)
        # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step/10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 14
        agent.actor.save_weights("ddpg_actor.h5", overwrite=True)
        agent.critic.save_weights("ddpg_critic.h5", overwrite=True)
    else:
        observe = env.reset()

    # get necessary information from the observation
    state = np.hstack((observe.angle, observe.track, observe.trackPos,
                       observe.speedX, observe.speedY, observe.speedZ,
                       observe.wheelSpinVel / 100.0, observe.rpm))
    done = False

    while not done:
        step += 1
        global_step += 1
        action = agent.get_action(state.reshape(1, state.shape[0]))
        observe, reward, done, info = env.step(action)
        score += reward
        next_state = np.hstack((observe.angle, observe.track, observe.trackPos,
                                observe.speedX, observe.speedY, observe.speedZ,
                                observe.wheelSpinVel / 100.0, observe.rpm))

        agent.append_sample(state, action, reward, next_state, done)

        if global_step > 1000:
            agent.train_model()

        # print(' step: ', step, ' action: ', action, ' reward: ', reward)
        state = next_state

        if done:
            print('episode: ', e, ' score: ', score, ' step: ', global_step,
Example No. 15
def playGame(checkpoints=None,
             train_indicator=1,
             eps=1.0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.01  #Learning rate for Actor
    LRC = 0.05  #Learning rate for Critic

    vision = True
    action_dim = 3  #Steering/Acceleration/Brake

    if vision:
        state_dim = (64, 64, 3)  # image input shape
    else:
        state_dim = 29
    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    reward = 0
    done = False
    step = 0
    epsilon = eps
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)
    summary_writer = tf.summary.FileWriter('logs', graph=sess.graph)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA,
                         vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC,
                           vision)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    log_file = open('train_log.log', 'w')
    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.model.load_weights("criticmodel_{}.h5".format(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights(
            "criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        if vision:
            history.fill((ob.img))
            s_t = history.get()
        else:
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        total_damage = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            if vision:
                a_t_original = actor.model.predict(
                    s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(s_t.reshape(
                    1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.30, 0.30)
            noise_t[0][1] = 0.1 + train_indicator * max(
                epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            damage = ob.damage

            if vision:
                last_s_t = history.get().copy()
                history.add((ob.img))
                next_s_t = history.get().copy()
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t,
                             done)  #Add replay buffer
                s_t1 = history.get()
            else:
                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape(
                        (-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states,
                     actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)

                actor.target_train()
                critic.target_train()

            total_reward += r_t
            total_damage += damage
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i),
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_{}.h5".format(i),
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)
        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward) + "  EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) +
              ' Min: ' + str(min_reward))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example No. 16
    xmax = np.max(x)
    return np.linalg.norm(x / xmax) * xmax

for episode in range(4000):
    print('Episode: ', episode)
    if episode % 1 == 0:
        ob = env.reset(relaunch=True)  # with torcs relaunch (avoid memory leak bug in torcs)
    else:
        ob = env.reset()
    for move in range(10000):
        if TARGET_MODEL:
            action = act(target_actor_model, observation_formatter(ob))
        else:
            action = act(actor_model, observation_formatter(ob))
        action = action.flatten()
        new_ob, reward, done, _ = env.step(action)
        reward = reward/400
        print('\nq-value: ', target_critic_model.predict(observation_formatter(ob, action)))
        print('reward: ', reward, '\n')
        if np.isnan(reward):
            break
        buffer.loc[len(buffer), :] = [ob, action, reward, new_ob, done]
        update_actor_critic_model(sess, [actor_model, critic_model, target_actor_model, target_critic_model], buffer,
                                  [action_gradient_holder, update_op, gradient_op], ITERATIONS, BATCH_SIZE)
        ob = new_ob
        EPSILON = max(EPSILON*EPSILON_DECAY, MINIMUM_EPSILON)
        #print('\nepsilon: ', EPSILON, '\n')
        if done:
            break
        
# shut down torcs
Example No. 17
agent = Agent(env)

print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))

    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()

    total_reward = 0.
    for j in range(max_steps):
        action = agent.act(ob)

        ob, reward, done, _ = env.step(action)
        # print(ob)
        total_reward += reward

        step += 1
        if done:
            break

    print("TOTAL REWARD @ " + str(i) + " -th Episode  :  " + str(total_reward))
    print("Total Step: " + str(step))
    print("")

env.end()  # This is for shutting down TORCS
print("Finish.")
Example No. 18
def main():
    # Load the expert data
    # expert_states = np.genfromtxt('./observation_ddpg.csv', delimiter=',', dtype=np.float32)
    # expert_actions = np.genfromtxt('./action_ddpg.csv', delimiter=',', dtype=np.float32)

    expert_states = np.load('./expert_state.npy')
    expert_actions = np.load('./expert_action.npy')

    # Env, model load
    env = TorcsEnv(vision=False, throttle=True, text=True, gear_change=False)
    ppo = PPOAgent()
    D = Discriminaor()
    # saver = tf.train.Saver()
    saver = tf.train.import_meta_graph(
        './save_model10/max_score/max_model_111352.8971768222.ckpt.meta')

    score_buf, graph_d_reward = [], []

    MAX_STEP = 906
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(
            sess, './save_model10/max_score/max_model_111352.8971768222.ckpt')
        max_score, ep_score, max_ep_score, change_count = 0, 0, 0, 0

        for ep in range(NUM_EPISODE):
            action_buf, state_buf, reward_buf = [], [], []
            step, score = 0, 0
            done = False
            ep_score = 0
            # Relaunch every 20 episodes because of the memory leak
            if np.mod(ep, 20) == 0: obs = env.reset(relaunch=True)
            else: obs = env.reset()

            state = convert_obs(obs)

            while not (step == MAX_STEP):
                if done:
                    print('\nDone: {0}\n'.format(ep_score))

                    if ep_score > max_ep_score and ep_score > 25000 and MAX_STEP == 302:
                        max_score = score
                        saver.save(
                            sess, './save_model20/per_episode/epMAX_' +
                            str(step) + '_' + str(ep_score) + '.ckpt')
                        print(
                            '\n########## update max score and save model #########\n'
                        )

                    obs = env.reset()
                    state = convert_obs(obs)
                    max_ep_score = ep_score
                    ep_score = 0

                step += 1

                action = ppo.choose_action(state)
                next_obs, reward, done, _ = env.step(action)

                state_buf.append(state)
                action_buf.append(action)
                reward_buf.append(reward)

                score += reward
                ep_score += reward
                next_state = convert_obs(next_obs)
                state = next_state
                print('\r{}/{}'.format(step, MAX_STEP), flush=True, end='')

            score_buf.append(score)

            # Discriminator Train
            for _ in range(2):
                # sample_indices = (np.random.randint(low=0, high=expert_states.shape[0], size=MAX_STEP))
                # inp = [expert_states, expert_actions]
                # sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data

                start_idx = np.random.randint(low=0, high=4799)
                start_idx = start_idx * 280

                sampled_expert_s = expert_states[start_idx:start_idx +
                                                 len(state_buf), :]
                sampled_expert_a = expert_actions[start_idx:start_idx +
                                                  len(action_buf), :]

                D.train(expert_s=sampled_expert_s,
                        expert_a=sampled_expert_a,
                        agent_s=np.vstack(state_buf),
                        agent_a=np.vstack(action_buf))

            d_rewards = D.get_rewards(agent_s=np.vstack(state_buf),
                                      agent_a=np.vstack(action_buf))
            d_reward_buf = [np.asscalar(r) for r in d_rewards]
            graph_d_reward.append(sum(d_reward_buf))
            if done:
                last_value = 0.0
            else:
                last_value = ppo.get_value(next_state)

            discounted_reward = []
            for r in d_reward_buf[::-1]:
                last_value = r + GAMMA * last_value
                discounted_reward.append(last_value)
            discounted_reward.reverse()
            batch_action = np.vstack(action_buf)
            batch_state = np.vstack(state_buf)
            batch_discount_reward = np.array(discounted_reward)[:, np.newaxis]
            ppo.update(batch_state, batch_action, batch_discount_reward)

            # if score > max_score and score > 90000:
            #     max_score = score
            #     saver.save(sess, './save_model17/max_score/max_model_'+str(max_score)+'.ckpt')
            #     print('\n########## update max score and save model #########\n')

            if ep % 50 == 0 and ep > 0:
                fig = plt.figure(figsize=(16, 8))
                plt.xlabel('EP')
                plt.ylabel('SCORE')
                plt.plot(list(range(len(score_buf))),
                         score_buf,
                         c='r',
                         lw=1,
                         ls='-')
                fig.savefig('./save_model20/graph/env_reward_graph.png')
                fig.clear()
                plt.clf()
            if ep % 50 == 0 and ep > 0:
                fig = plt.figure(figsize=(16, 8))
                plt.xlabel('EP')
                plt.ylabel('SCORE')
                plt.plot(list(range(len(graph_d_reward))),
                         graph_d_reward,
                         c='b',
                         lw=1,
                         ls='-')
                fig.savefig('./save_model20/graph/d_reward_graph.png')
                fig.clear()
                plt.clf()
                print('\n@@@@@@@@@@ save model(per 200 ep) @@@@@@@@@@\n')

            print(
                '\nEp: {0}\tScore(Env): {1:.6}\tReward(D): {2:.6}\tStep: {3}\n'
                .format(ep, score, sum(d_reward_buf), step))

            if score > 90000 and MAX_STEP == 906:
                change_count += 1
                if change_count == 100:
                    MAX_STEP = 604
                    ep_score, max_ep_score = 0, 0
            elif score > 60000 and MAX_STEP == 604:
                change_count += 1
                if change_count == 200:
                    MAX_STEP = 302
                    ep_score, max_ep_score = 0, 0

    os.system('pkill torcs')
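D.get_rewards above replaces the environment reward with a discriminator-based one, as in GAIL-style imitation. Whether Discriminaor uses exactly this surrogate is an assumption, but a common form is:

import numpy as np

def gail_style_reward(d_prob, eps=1e-8):
    # d_prob: discriminator's probability that (s, a) came from the expert.
    # Looking more like the expert yields a higher reward.
    return -np.log(np.clip(1.0 - d_prob, eps, 1.0))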
Example No. 19
action_list = []
reward_list = []

env = TorcsEnv(vision=True, throttle=False)
ob = env.reset(relaunch=True)

print('Collecting data...')
for i in range(steps):
    if i == 0:
        act = np.array([0.0])
    else:
        act = get_teacher_action(ob)

    if i%100 == 0:
        print(i)
    ob, reward, done, _ = env.step(act)
    img_list.append(ob.img)
    action_list.append(act)
    reward_list.append(np.array([reward]))

env.end()

print('Packing data into arrays...')
for img, act, rew in zip(img_list, action_list, reward_list):
    images_all = np.concatenate([images_all, img_reshape(img)], axis=0)
    actions_all = np.concatenate([actions_all, np.reshape(act, [1,action_dim])], axis=0)
    rewards_all = np.concatenate([rewards_all, rew], axis=0)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
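get_teacher_action supplies the scripted labels for this data-collection run; since the env is built with throttle=False, the action is steering only. A sketch using the usual proportional steering heuristic (the gains are assumptions):

import numpy as np

def get_teacher_action(ob):
    # Steer toward the track axis from the car's angle and lateral position.
    steer = ob.angle * 10.0 / np.pi - ob.trackPos * 0.10
    return np.array([np.clip(steer, -1.0, 1.0)])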
Example No. 20
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    pre_model = load_model("weights_rescale_all-0000.hdf5")
    # x = np.array([ 4.82767379e-01,  5.92105016e-02,  3.61700505e-01,  2.74807483e-01,
    #     2.31401995e-01,  2.07236990e-01,  1.95800006e-01,  1.89892501e-01,
    #     1.84837490e-01,  1.81293502e-01,  1.77807003e-01,  1.74377009e-01,
    #     1.71005994e-01,  1.66384503e-01,  1.61247000e-01,  1.52030498e-01,
    #     1.35238498e-01,  1.11962005e-01,  8.79574940e-02,  4.76383008e-02,
    #     4.78339800e-01,  6.97819047e-01,  4.60800716e-01,  5.00754069e-01,
    #     -1.00000000e+00,  9.99979496e-01,  8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0])
    # print(pre_y[0])

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        gap = (i / 10) / 100.0
        attack_step = -1
        attack_target = 0
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # os.system("scrot saved_pic/{}.png".format(j))
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))
            #    if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            # if(step == 60):
            # a_t[0][0] = 1.0
            # s_t_scaled = rescale_state(s_t)
            # # print(s_t[0])
            # s_t_0 = restore_state(s_t_scaled)
            # # print(s_t_0[0])
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val/10.0
            # # print(actions)
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # # print(x_0.shape, s_t_scaled_list.shape, actions.shape)
            # pre_y = pre_model.predict(x_0)
            # # print(x_0[0])
            # # print(pre_y[0])

            # steer_index = int(a_t[0][0]*10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)

            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index/10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1

            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list)*100000))
            # print("{}".format(max(dis_list)*100000))

            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # print(actions[0][0])

            # ob, r_t, done, info = env.step(new_a_t[0])
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])
            # print(a_t[0][0])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))

            # action_states = []
            # for i in range(-5, 6):

            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv', 'a') as the_file:
            the_file.write(s)
        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name,
                                                   int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        # pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 21
def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000  # Replay buffer capacity
    BATCH_SIZE = 32  # Batch size: number of samples per update
    GAMMA = 0.99  # Discount factor
    TAU = 0.001     # Target Network HyperParameters (soft-update rate)
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # Number of sensor inputs

    np.random.seed(1337)  # Fix the random seed so runs are reproducible

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU memory policy: grow GPU memory allocation on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Alternatively, hard-limit GPU memory usage to 40%:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    #  Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # Current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # Convert to a string used in the CSV paths
    folder_path = "practise_progress/" + theTime + "/"  # Linux-style path
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
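        # 29-dimensional state: angle (1) + 19 range-finder track sensors + trackPos (1)
        # + speedX/speedY/speedZ (3) + 4 wheel-spin velocities + rpm (1)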
     
        total_reward = 0.

        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake", "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)

        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)
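            # OU.function(x, mu, theta, sigma) is assumed to implement the standard
            # Ornstein-Uhlenbeck drift used for DDPG exploration, roughly
            #     theta * (mu - x) + sigma * N(0, 1)
            # so steering is pulled toward 0.0, throttle toward 0.5 and brake toward
            # -0.1, with the whole term annealed away as epsilon decays to 0.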

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      # Add replay buffer
            
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300, ob.speedZ * 300,
                       a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t, loss]
            """        参数记录
                       轮次  步骤计数  车辆位置  X轴速度  Y轴速度  Z轴速度
                       加速输出  转向输出  刹车输出  回报  损失函"""
            writer.writerow(csvData)
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)



        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  #  This is for shutting down TORCS
    print("Finish.")
def playGame(train_indicator=1,
             safety_constrain_flag=True):  #1 means Train, 0 means simply Run
    plt.ion()
    args = parser.parse_args()

    np.random.seed(1337)

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Define two intra-policies
    overtaking_policy = ActorNetwork(sess, args.state_size, args.action_size)
    following_policy = ActorNetwork(sess, args.state_size, args.action_size)
    try:
        overtaking_policy.model.load_weights("actormodel_overtaking.h5")
        overtaking_policy.target_model.load_weights("actormodel_overtaking.h5")
        following_policy.model.load_weights("actormodel_following.h5")
        following_policy.target_model.load_weights("actormodel_following.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    # with fixed following policy
    #option_policies = [overtaking_policy,overtaking_policy,overtaking_policy,following_policy(0.5),following_policy(0.5),following_policy(0.5)]
    # with learned following policy
    option_policies = [
        overtaking_policy, overtaking_policy, overtaking_policy,
        following_policy, following_policy, following_policy
    ]

    termination_steps = [10, 20, 30, 10, 20, 30]
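    # Each option pairs an intra-option policy with a fixed execution horizon:
    # the three overtaking options and the three following options run for
    # 10/20/30 primitive steps before control returns to the option-value critic.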

    # Define option-value function Q_Omega(s,omega): estimate values upon arrival
    critic = OptionValueCritic(args.state_size, args.option_size,
                               args.discount, args.learning_rate_critic,
                               args.epsilon, args.epsilon_min,
                               args.epsilon_decay, args.tau)

    try:
        critic.load("option_value_model.h5")
        print("Critic Weight load successfully")
    except:
        print("Cannot find the critic weight")

    history = np.zeros((args.nepisodes, 2))

    # Define a buffer space to store samples
    buff = ReplayBuffer(args.buffer_size)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=args.vision, throttle=True, gear_change=False)

    print("TORCS Experiment Start.")

    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    results_list = []
    option_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []

    for episode in range(args.nepisodes):
        # Define variables to store values
        cumreward = 0.
        duration = 1
        option_switches = 0
        avgduration = 0.
        reward_option = 0
        total_options = 0
        damage_times = 0
        danger_time = 0
        collision_time = 0
        primitive_action_step = 0

        if np.mod(episode, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        state = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        state = state.reshape(1, state.shape[0])

        for step in range(args.nsteps):
            total_options += 1
            option = critic.get_option(state, train_indicator)
            reward_option = 0
            for i in range(termination_steps[option]):
                primitive_action_step += 1
                action = option_policies[option].target_model.predict(state)
                '''
                if option == 0 or option == 1 or option == 2:
                    action = option_policies[option].target_model.predict(state)
                else:
                    action = option_policies[option].act(ob)
                '''
                print(action)
                action = Low_level_controller(action[0][0], action[0][1], ob,
                                              safety_constrain_flag)

                print("Option: {} Action:{}".format(option, action))
                ob, r_t_primitive, done, _ = env.step(action)
                if r_t_primitive == -30.0:
                    collision_time += 1
                elif r_t_primitive == -5.0:
                    danger_time += 1
                damage_times = collision_time + danger_time

                option_list.append(option)
                trackPos_list.append(ob.trackPos)
                speed_list.append(ob.speedX)
                epreward_list.append(r_t_primitive)

                reward_option = reward_option + args.discount**(
                    i) * r_t_primitive
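                # Accumulate the SMDP-style option return: primitive rewards collected
                # while the option runs are discounted by gamma^i.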
                state_ = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
                state_ = state_.reshape(1, state_.shape[0])
                state = state_
                if done:
                    break

            buff.add(state, option, reward_option, state_, done)

            cumreward += reward_option

            reward_ep_per_step = cumreward / primitive_action_step
            damage_rate = damage_times / primitive_action_step
            if done:
                break
        if train_indicator:
            batch = buff.getBatch(args.batch_size)
            critic.replay(batch)
            if episode % 10 == 0:
                critic.save("option_value_model.h5")
        if train_indicator:
            # Save the results
            cumreward_list.append(cumreward)
            average_step_reward_list.append(reward_ep_per_step)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(critic.epsilon)
            results_list = [
                cumreward_list, average_step_reward_list, damage_rate_list,
                epsilon_list
            ]
            sio.savemat(
                'results_both_learned.mat', {
                    'total_reward': cumreward_list,
                    'average_reward': average_step_reward_list,
                    'epsilon': epsilon_list,
                    'damage_rate': damage_rate_list
                })
        else:
            sio.savemat(
                'test1lf1r.mat', {
                    'ep_reward': epreward_list,
                    'option': option_list,
                    'trackPos': trackPos_list,
                    'speed': speed_list
                })
            print('damage rate is:', damage_rate)

        history[episode, 0] = step
        history[episode, 1] = avgduration

        plt.figure(1)
        plt.hold(True)
        plt.subplot(311)
        plt.plot(episode, cumreward, 'ro')
        plt.xlabel('episode')
        plt.ylabel('Total reward per epsiode')
        plt.subplot(312)
        plt.hold(True)
        plt.plot(episode, cumreward / total_options, 'bo')
        plt.xlabel('episode')
        plt.ylabel('Average reward per option')
        plt.subplot(313)
        plt.hold(True)
        plt.plot(episode, critic.epsilon, 'go')
        plt.xlabel('episode')
        plt.ylabel('epsilon')

        plt.draw()
        plt.show()
        plt.pause(0.001)

    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')

    print("Finish.")
def preTrain(): # train the NN of actor and ciritc using existing rules
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.0001    #Learning rate for Actor
    LRC = 0.001     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)
    # Generate a driver
    driver = DriverExample()

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("pre_actormodel.h5")
        critic.model.load_weights("pre_criticmodel.h5")
        actor.target_model.load_weights("pre_actormodel.h5")
        critic.target_model.load_weights("pre_criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        for j in range(max_steps):
            loss_actor = 0
            loss_critic = 0
            a_t = np.zeros([1,action_dim])
            
            # the driver produces the actions
            a_t = driver.action(s_t.reshape(state_dim, ))

            ob, r_t, done, info = env.step(a_t)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t, r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
            """
            if (train_indicator == 1):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
            """
            loss_actor += actor.model.train_on_batch(states, actions) # train actor
            loss_critic += critic.model.train_on_batch([states,actions], y_t) # train critic
            actor.target_train()
            critic.target_train()

            total_reward += r_t
            s_t = s_t1
        
            print("Episode", i, "Step", step, ": ")
            print("Action", a_t, "Reward", r_t)
            print("loss_actor", loss_actor, "loss_critic", loss_critic)
        
            step += 1

            if np.mod(step, 100) == 0:
                print("Now we save model")
                actor.model.save_weights("pre_actormodel.h5", overwrite=True)
                with open("pre_actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("pre_criticmodel.h5", overwrite=True)
                with open("pre_criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
            
            if done:
                break

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 24
def main():
    global play, replay_buffer, q_act_net, q_target_net, step
    #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    tstr = time.strftime('_%H_%M_%S_')

    first_run = False  #don't use saved nets/buffer
    reset_buffer = True  #don't use saved buffer

    #dqn params
    GAMMA = 0.99
    TAU = .0001  #.0001

    #exploration noise params
    act_noise_init = 0.5  #.75 #.25
    act_noise_final = .01  # .25
    act_noise_interval = 100000
    rnd_range = 1

    #lag augmentation
    packet_lost = 0  #0.01

    #action smoothing augmentation
    lambda_spatial_q = 0
    action_smoother = .33
    action_limiter = .33

    episode_count = 10000
    max_steps = 5000
    save_in_iters = 15000
    #100000

    #start training after accumulating train_start_num samples
    train_start_num = 1 * BATCH_SIZE

    caffe.set_mode_gpu()
    caffe.set_device(0)

    #balance track samples in replay buffer
    track_balance = .9

    #n-steps dqn, steps=max_n_batches*batch size
    max_n_batches = 16

    # use n-steps dqn for this ratio, rest 1-step
    n_step_ratio = .75

    #priority buffer
    k_priority_try = 2

    if n_step_ratio > 0 and max_n_batches > 0:
        n_steps_dqn = True
    else:
        n_steps_dqn = False

    #average speed
    c_speed = 35  # 50
    #speed variance
    delta_speed = 5  # 7.5
    #minimum number of steps between target-speed switches
    swith_count_min = 50

    #target frame time
    frame_rate = .4
    #fail if lag is more then
    t_delta_fail = frame_rate * 1.75
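    # a frame that takes more than 1.75x the target frame time counts as a
    # rendering/lag failure (see the t_delta checks inside the step loop)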

    #for rebound handling
    rebound_count_max = 5

    start_run = 50
    # for error handling
    after_start_check = 100
    max_errors = 50

    #solver for current net
    if not play:
        critic_solver = caffe.get_solver(
            current_dir + 'resnet_torcs/dqn_critic_solver.prototxt')
    if first_run:
        # target net:
        q_target_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_batch_dqn.prototxt',
            current_dir + 'r18nb.caffemodel', caffe.TEST)
        # current net:
        q_act_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt',
            caffe.TEST)
        if not play:
            ParamCopy(critic_solver.net.params, q_target_net.params)
        ParamCopy(q_act_net.params, q_target_net.params)
    else:
        #target net:
        q_target_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_batch_dqn.prototxt',
            'qq_target.caffemodel', caffe.TEST)
        #current net:
        q_act_net = caffe.Net(
            current_dir + 'resnet_torcs/critic_deploy_dqn.prototxt',
            'q_solver.caffemodel', caffe.TEST)
        if not play:
            ParamCopy(critic_solver.net.params, q_act_net.params)
        if not play and not reset_buffer:
            print('loading replay buffer')
            replay_buffer = load_replay()
            replay_buffer.size_reduce(BUFFER_SIZE)
            print('replay buffer loaded')

    print('models loaded ***************************')

    if not play:
        assert q_target_net.blobs['state'].data.shape[0] == BATCH_SIZE
        assert q_act_net.blobs['state'].data.shape[0] == 1
        assert critic_solver.net.blobs['state'].data.shape[0] == BATCH_SIZE

        assert q_target_net.blobs['state'].data.shape[1] == CHANNELS
        assert q_act_net.blobs['state'].data.shape[1] == CHANNELS
        assert critic_solver.net.blobs['state'].data.shape[1] == CHANNELS

        assert q_target_net.blobs['q_action'].data.shape[1] == DISCR_A
        assert q_act_net.blobs['q_action'].data.shape[1] == DISCR_A
        assert critic_solver.net.blobs['q_action'].data.shape[1] == DISCR_A

    max_reached_step = 150  #used for track balance
    images_history = []  #used for input image
    step = 0  #total number of simulation steps
    save_count = 0  #used for saving nets/buffer
    n_batch = 0  #used for n-steps

    q_loss = 0  #main loss
    # Generate a Torcs environment
    env = TorcsEnv(vision=True, throttle=False, observer=False)
    time_start = time.time()
    track_id = 0  #track

    #n-step temp vars
    n_steps_cont_from_prev = False
    prev_start_pos = -1
    prev_track_id = -1
    Qlast = -1
    episod_steps = 0
    n_steps_used = 0
    batches_used = 0

    #for error failure
    rest_fail = 0

    rebound_events = 0

    for i in range(episode_count):
        #balance tracks
        if episod_steps >= max_reached_step * track_balance:
            track = t_list[track_id]
            change_track(
                "/usr/local/share/games/torcs/config/raceman/quickrace.xml",
                track)
            print "Track: ", track, "track_id", track_id
            episod_steps = 0

        print("Episode : " + str(i))
        ob = env.reset(relaunch=True)

        s_t = None  #input image
        total_reward = 0.

        #for randomizing velocity
        switch_count = swith_count_min + random.randint(0, swith_count_min)

        #for handling out-of-lane
        rebound = False
        rebound_count = 0
        track_pos = 0
        error_count = 0
        act_prev = np.array([0.])
        t_delta = 0

        for j in range(max_steps):
            max_reached_step = max(max_reached_step, j)
            a_t = np.array([0.])  #action
            skip_state = False
            error_present = False

            #exploration noise params
            act_noise = act_noise_init + (
                act_noise_final - act_noise_init) * min(
                    step * 1. / act_noise_interval, 1.)
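            # act_noise anneals linearly from act_noise_init to act_noise_final over
            # act_noise_interval environment steps; rnd_noise (below) is presumably
            # meant to widen the random index perturbation early in training, but with
            # rnd_range = 1 that branch never triggers.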
            rnd_noise = 1
            if rnd_range > 1:
                rnd_noise = int(
                    (rnd_range + 1) *
                    max(1.,
                        float(act_noise_interval - step) / act_noise_interval))

            #get action =======================================================
            if s_t is None:
                action_index = random.randrange(DISCR_A)
                print('----------Random Action---------- action_index', action_index)
                a_t[0] = ind2a(action_index, DISCR_A, DELTA_A)
            else:
                a_t[0] = qchoice(q_act_net, s_t, CHANNELS, DISCR_A, DELTA_A)
                #apply exploration noise
                if not play and random.random() <= act_noise:
                    ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                    r = 1
                    if rnd_noise > 1:
                        r = randint(1, rnd_noise)
                    ind += randint(-r, r)
                    ind = min(max(ind, 0), DISCR_A - 1)
                    a_t[0] = ind2a(ind, DISCR_A, DELTA_A)

            #if still no action use random
            if a_t is None:
                action_index = random.randrange(DISCR_A)
                print('rnd action_index', action_index)
                a_t[0] = ind2a(action_index, DISCR_A, DELTA_A)

            #starting area
            if j < start_run:
                a_t[0] = 0

            #action limiter
            if not play and abs(
                    a_t[0]) > DELTA_A / 2 and random.random() < action_limiter:
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                dind = ind - DISCR_A / 2
                if dind > (DISCR_A - 1) / 4:
                    dind = (DISCR_A - 1) / 4
                if dind < -(DISCR_A - 1) / 4:
                    dind = -(DISCR_A - 1) / 4
                a_t[0] = ind2a(dind + DISCR_A / 2, DISCR_A, DELTA_A)

            #save action
            a_0_list.append(a_t)

            #fail on render delay
            if not play and t_delta > t_delta_fail and i > rest_fail + 10 and j >= after_start_check:
                error_present = True
                if error_count >= max_errors / 2:
                    print('delta fail **************************')
                    rest_fail = i
                    break
                else:
                    error_count += 1

            #randomize speed
            if j % switch_count == 0 and not play:
                tag_speed_rnd = c_speed - delta_speed + random.uniform(
                    0, delta_speed * 2)
            else:
                tag_speed_rnd = c_speed

            #render delay compensation
            if t_delta > frame_rate:
                tag_speed = frame_rate / t_delta * tag_speed_rnd
            else:
                tag_speed = tag_speed_rnd

            #handle out-of-lane event
            if rebound:
                rebound_count = rebound_count_max
            else:
                rebound_count = max(0, rebound_count - 1)
            if (rebound_count > rebound_count_max / 2
                    and abs(track_pos) > .7) or rebound:
                angle = -observation.angle
                if angle * track_pos > 0 and abs(angle) > .2:
                    a_t[0] = -sign(track_pos) * 4 * DELTA_A / 5
                if angle * track_pos > 0 and abs(angle) <= .2:
                    a_t[0] = -sign(track_pos) * 2 * DELTA_A / 5
                if angle * track_pos < 0 and abs(angle) <= .15:
                    a_t[0] = -sign(track_pos) * DELTA_A / 5
                if angle * track_pos < 0 and abs(angle) > .15:
                    a_t[0] = 0
                if angle * track_pos < 0 and abs(angle) >= .35:
                    a_t[0] = sign(track_pos) * DELTA_A / 5
                tag_speed = min(tag_speed, 20)
                print "############ rebound, action", a_t[
                    0], "V angle", angle, "###############"

            #smooth action
            if not play and action_smoother > 0 and random.random(
            ) < action_smoother:
                ind_prev = a2ind(act_prev[0], DISCR_A, DELTA_A)
                ind = a2ind(a_t[0], DISCR_A, DELTA_A)
                if abs(ind - ind_prev) > 1:
                    print "smooth ind", ind, "->", np.rint(.5 *
                                                           (ind_prev + ind))
                    ind = int(.5 * (ind_prev + ind))
                    a_t[0] = ind2a(ind, DISCR_A, DELTA_A)

            a_act = a_t
            #lag augmentation: occasionally replay the previous action as if a packet was lost
            if not play and random.random() < packet_lost and t_delta < frame_rate:
                a_act = act_prev

            #===================== main environment step =========================================
            obs0 = time.time()
            prev_rebound = rebound
            observation, r_t, done, rebound, _ = env.step(a_act, tag_speed)
            curr_time = time.time()
            t_delta = curr_time - time_start
            time_start = curr_time
            #====================================================================================
            if rebound and not prev_rebound:
                rebound_events += 1
            print('t_delta', t_delta, "step", j, "step time", curr_time - obs0,
                  "tag_speed_rnd", tag_speed_rnd, "rebound_events", rebound_events)
            if rebound:
                r_t = 0
            if prev_rebound and r_t == 0:
                skip_state = True

            #speed failure, could be moved to gym_torcs
            if observation.speedX < .01 and j >= after_start_check and t_delta < t_delta_fail:
                skip_state = True
                error_present = True
                r_t = 0
                if error_count >= max_errors:
                    print('speed too slow fail, speed', 300 * observation.speedX, '**************************')
                    break
                else:
                    error_count += 1

            #make state ========================================================
            image = observation.img
            images_history.append(image)
            while len(images_history) > CHANNELS + 1:
                images_history.pop(0)
            s_t1 = make_state(images_history, CHANNELS)
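            # keep a short rolling window of frames; make_state presumably stacks the
            # most recent CHANNELS frames into the network's input volume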
            track_pos = observation.trackPos

            #save stat
            reward_list.append(r_t)
            track_list.append(track_pos)
            yspeed_list.append(observation.speedY)

            #store data into replay buffer ======================================
            do_store = not play and s_t is not None and s_t1 is not None and not skip_state
            if do_store:
                print('add data, action', a_t[0], 'reward', r_t)
                w_p = j
                replay_buffer.add(s_t, a_t, r_t, s_t1, done, w_p, track_id, -1,
                                  -1)
                print('***** stored: track_pos', track_pos, 'angle', observation.angle,
                      'max_step', max_reached_step, 'Episode', i)
            elif not play:
                print('skipped state track_pos', track_pos, 'angle', observation.angle,
                      'max_step', max_reached_step, 'Episode', i)

            #training  ======================================
            if not play and replay_buffer.num_experiences > train_start_num:
                #get batch using n-steps if previous batch was using n-step
                use_n_steps_now = n_steps_dqn
                if n_batch >= max_n_batches:
                    use_n_steps_now = False
                    n_batch = 0
                if n_steps_cont_from_prev and use_n_steps_now and max_n_batches > 1:
                    assert prev_start_pos >= 0
                    batch, n_steps_collected, prev_start_pos, prev_track_id =\
                        replay_buffer.getBatch4Pos(BATCH_SIZE, prev_start_pos, prev_track_id)
                    n_step_continued = n_steps_collected
                else:
                    n_step_continued = False
                if n_steps_used >= n_step_ratio * batches_used and not n_step_continued:
                    use_n_steps_now = False

                #get batch if previous batch was *not* using n-step
                if not n_step_continued:
                    batch, n_steps_collected, prev_start_pos, prev_track_id =\
                        replay_buffer.getBatch(BATCH_SIZE, max_n_batches, k_priority_try, n_steps=use_n_steps_now)

                #net training =============
                q_loss, Qlast = train_on_batch(batch, q_target_net,
                                               critic_solver, DISCR_A, DELTA_A,
                                               BATCH_SIZE, GAMMA,
                                               n_steps_collected,
                                               n_step_continued, Qlast,
                                               lambda_spatial_q)

                #update n-step vars
                if n_steps_collected:
                    n_batch += 1
                    n_steps_used += 1
                else:
                    n_batch = 0
                batches_used += 1
                n_steps_cont_from_prev = n_steps_collected and prev_start_pos >= 0

                # target update ==============
                SoftUpdate(q_target_net.params, critic_solver.net.params, TAU)
                ParamCopy(q_act_net.params, critic_solver.net.params)
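                # SoftUpdate presumably blends the trained weights into the target net,
                # w_target <- TAU * w + (1 - TAU) * w_target, while ParamCopy keeps the
                # single-sample action net identical to the current training weights.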
                save_count += 1

            #save loss
            if not play:
                q_loss_list.append(q_loss)

            #update local vars
            s_t = s_t1
            act_prev = a_t
            if done:
                s_t = None
            if not error_present:
                error_count = max(0, error_count - 1)
            total_reward += r_t
            episod_steps += 1
            step += 1
            if done:
                break

        #save nets and buffer
        if not play and save_count >= save_in_iters:
            print "start save", save_count, step
            save_count = 0
            save_nets(q_act_net, q_target_net, step, replay_buffer)
            save_state(a_0_list, a_1_list, q_loss_list, reward_list,
                       track_list, yspeed_list, tstr + str(step))
        track_id = (track_id + 1) % len(t_list)
        print("TOTAL REWARD @ " + str(i) + " -th Episode  :  " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
    print("Finishing torcs.")
    env.end()  # This is for shutting down TORCS
    #save nets and buffer
    if not play:
        save_state(a_0_list, a_1_list, q_loss_list, reward_list, track_list,
                   yspeed_list, tstr + str(step))
        save_nets(q_act_net, q_target_net, step, replay_buffer, "_finished")
    print('Finish')
Exemplo n.º 25
agent = Agent(2)  # steering only

print("TORCS Experiment Start.")
for i in range(episode_count):
    print("Episode : " + str(i))

    if np.mod(i, 3) == 0:
        # Sometimes you need to relaunch TORCS because of the memory leak error
        ob = env.reset(relaunch=True)
    else:
        ob = env.reset()
    total_reward = 0.
    for j in range(max_steps):
        action = agent.act(ob, reward, done, False)

        ob, reward, done, info = env.step(action)
        total_reward += reward

        step += 1
        if done:
            print(info)
            break

    print("TOTAL REWARD @ " + str(i) +" -th Episode  :  " + str(total_reward))
    print("Total Step: " + str(step))
    print("")

env.end()  # This is for shutting down TORCS
print("Finish.")
Exemplo n.º 26
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    vision = False

    episode_count = 1
    max_steps = 1000  #100000
    done = False
    step = 0
    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, 1, TAU, LRA)
    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load Actor model's weights")
    try:
        actor.model.load_weights("actormodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.

        for j in range(max_steps):
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            ob, r_t, done, info = env.step(a_t_original[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            total_reward += r_t
            s_t = s_t1

            if np.mod(j, 100) == 0:
                print("Episode", i, "Step", step, "Action", a_t_original[0],
                      "Reward", r_t)

            step += 1
            if done:
                break

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 27
def main():
    """main method
    
    log runtime and print it at the end
    """
    s_time = timeit.default_timer()     
    global iteration
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    memory = ReplayBuffer()
    epsilon = 1
    train_indicator = True
    modelPATH = os.path.join('.',"models",'E0011.pt')

    q,q_target = QNet(state_dim,action_dim),QNet(state_dim,action_dim)
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(state_dim), MuNet(state_dim)
    mu_target.load_state_dict(mu.state_dict())
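    # the critic (q) and actor (mu) each get a target copy initialized with identical weights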
    steer_noise = OUN(np.zeros(1),theta = 0.6)
    accel_noise = OUN(np.zeros(1),theta = 0.6)
    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)

    #tensorboard writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "ddpg_torch", current_time+'E0011t')
    writer = SummaryWriter(log_dir)
    samplestate = torch.rand(1,29)
    sampleaction = torch.rand(1,2)

    #writer.add_graph(mu,samplestate)
    writer.add_graph(q,(samplestate,sampleaction))
    writer.flush()  # flush the graph; the writer is still used for scalars below

    if not train_indicator:
        mu = torch.load(modelPATH)
        mu.eval()
        ob = env.reset()
        score = 0
        for n_step in range(100000):
            s_t = np.hstack((ob.angle, ob.track,ob.trackPos,ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            a_t = mu(torch.from_numpy(s_t.reshape(1,-1)).float()).detach().numpy()
            ob,r_t,done,_ = env.step(a_t[0])
            score += r_t
            if done:
                print("score:",score)
                break
        env.end()
        return 0

    for n_epi in range(max_episode):
        print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size()))
        if np.mod(n_epi, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()
        a_t = np.zeros([1,action_dim])
        s_t = np.hstack((ob.angle, ob.track,ob.trackPos,ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        score = 0
        q_value_writer(q, mu, s_t, writer, 'Episode Start Q value')
        q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value')
        #t_start = timeit.default_timer()
        for n_step in range(max_step):
            #epsilon -= 1.0/EXPLORE
            a_origin = mu(torch.from_numpy(s_t.reshape(1,-1)).float())
            if train_indicator == True:#add noise for train
                # sn = max(epsilon,0)*steer_noise()
                sn = steer_noise()
                # an = max(epsilon,0)*accel_noise()
                an = accel_noise()
                a_s = a_origin.detach().numpy()[0][0] + sn
                a_t[0][0] = np.clip(a_s,-1,1) # fit in steer arange
                a_a = a_origin.detach().numpy()[0][1] + an
                a_t[0][1] = np.clip(a_a,0,1) # fit in accel arange
                #record noise movement
                if iteration%10==0:
                    writer.add_scalar('Steer noise', sn, iteration)
                    writer.add_scalar('Accel_noise', an, iteration)
            else:
                a_t = a_origin.detach().numpy()
            ob,r_t,done,_ = env.step(a_t[0])
            score += r_t

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            memory.put((s_t,a_t[0],r_t,s_t1,done))
            s_temp = copy.deepcopy(s_t) # for end q value log
            s_t = s_t1

            if train_indicator and memory.size()>train_start_size:
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer,writer)
                soft_update(mu, mu_target)
                soft_update(q,  q_target)
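                # soft_update presumably performs Polyak averaging of the target
                # parameters, e.g. p_target <- tau * p + (1 - tau) * p_target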
            
            iteration+=1

            if done:
                q_value_writer(q,mu,s_temp,writer,'Episode End Q value')
                q_value_writer(q_target,mu_target,s_temp,writer,'Episode End target Q value')
                break
        #t_end = timeit.default_timer()
        
        print("TOTAL REWARD @ " + str(n_epi) +"-th Episode  : Reward " + str(score))
        print("Total Step: " + str(n_step))
        print("")
        #print('{}steps, {} time spent'.format(i,t_end-t_start))
    
    torch.save(mu,modelPATH)
    
    env.end()
    
    e_time = timeit.default_timer()
    print("Total step {} and time spent {}".format(iteration, e_time-s_time))
Exemplo n.º 28
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.0001    #Learning rate for Actor
    LRC = 0.001     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1,action_dim])
            noise_t = np.zeros([1,action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
        
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 29
        if coach.step > 1:
            coach.train_examples.append([coach.last_state, coach.last_pi, r, coach.last_a])
            if len(coach.train_examples) >= coach.train_interval:
                coach.game.train_net(coach.train_examples)
                coach.train_examples = []

        # store current state waiting for next state to get reward
        state = np.reshape(image, (FLAGS.image_size,))
        coach.last_state = state

        coach.mcts = MCTS(coach.game, coach.avp_net)
        pi = coach.mcts.get_action_prob(state, w, 1)

        action = np.argmax(pi)
        np.set_printoptions(precision=4)
        pi = np.array(pi)
        coach.last_a = action
        coach.last_pi = pi
        steer_angle = -1.0 + action/(coach.action_num-1)*2.0
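        # map the discrete action index {0, ..., action_num-1} linearly onto the
        # continuous steering range [-1, 1]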
        a_t = np.zeros((1, ))
        a_t[0] = steer_angle
        obs, reward, done, _ = env.step(a_t)
        if reward < 0:
            reward = -1
        else:
            reward = 0.1
        print('eps', i, 'step', coach.step, 'r', r, 'action', a_t, 'pi', pi)
        if done:
            print('reset!'*10)
            obs = env.reset()
    def sample_path(self, num_episodes=None):
        """
    MODIFIED FOR TORCS!
    Sample path for the environment.
  
    Args:
            num_episodes:   the number of episodes to be sampled 
              if none, sample one batch (size indicated by config file)
    Returns:
        paths: a list of paths. Each path in paths is a dictionary with
            path["observation"] a numpy array of ordered observations in the path
            path["actions"] a numpy array of the corresponding actions in the path
            path["reward"] a numpy array of the corresponding rewards in the path
        total_rewards: the sum of all rewards encountered during this "path"

    """
        episode = 0
        episode_rewards = []
        episode_roll_distances = []
        paths = []
        t = 0
        i = 0
        print
        print("TORCS Experiment Start".center(80, '='))
        env = TorcsEnv(vision=self.config.vision,
                       throttle=self.config.throttle)
        #print('Num episodes', num_episodes)
        print('Using a batch size of: ', self.config.batch_size)
        try:
            while (num_episodes or t < self.config.batch_size):
                i += 1
                print('t', t, 'i', i)
                #Avoid a memory leak in TORCS by relaunching every 10 episodes
                if np.mod(i, 10) == 0:
                    state = env.reset(relaunch=True)
                else:
                    state = env.reset()
                state = np.concatenate([
                    state.track,
                    np.array([state.speedX, state.speedY, state.speedZ])
                ],
                                       axis=0)
                states, actions, rewards = [], [], []
                episode_reward = 0

                for step in range(self.config.max_ep_len):
                    states.append(state)
                    #print('State', state)
                    action = self.sess.run(self.sampled_action,
                                           feed_dict={
                                               self.observation_placeholder:
                                               np.reshape(
                                                   states[-1],
                                                   [1, self.observation_dim])
                                           })[0]
                    state, reward, done, info = env.step(action)
                    #print('\n State track', state.track)
                    #print('\n State focus', state.focus)
                    state = np.concatenate([
                        state.track,
                        np.array([state.speedX, state.speedY, state.speedZ])
                    ],
                                           axis=0)

                    #print('State', state)
                    #print('Reward', reward)
                    #print('info', info)
                    actions.append(action)
                    rewards.append(reward)
                    episode_reward += reward
                    t += 1
                    if (done or step == self.config.max_ep_len - 1):
                        episode_rewards.append(episode_reward)
                        episode_roll_distances.append(env.distance_travelled)
                        break
                    if (not num_episodes) and t == self.config.batch_size:
                        break

                path = {
                    "observation": np.array(states),
                    "reward": np.array(rewards),
                    "action": np.array(actions)
                }
                paths.append(path)
                episode += 1
                if num_episodes and episode >= num_episodes:
                    break
        finally:
            env.end()  # This is for shutting down TORCS
            print("Finished TORCS session".center(80, '='))
        return paths, episode_rewards, episode_roll_distances
Exemplo n.º 31
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005    #Learning rate for Actor
    LRC = 0.0005     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    # loading networks
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)
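            # Exploration noise: OU.function(x, mu, theta, sigma) presumably performs an
            # Ornstein-Uhlenbeck step, theta * (mu - x) + sigma * N(0, 1), and the result
            # is scaled by the annealed epsilon so exploration fades during training.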

            # The commented-out code below applies a stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.zeros(len(batch))  # Placeholder for the TD targets computed below

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
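            # y_t above are the Bellman/TD targets; the block below fits the critic
            # to them, steps the actor along the critic's action gradients, and
            # soft-updates both target networks (presumably by the factor TAU).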
       
            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t, "Reward", r_t, "Loss", loss) #, "curLapTime", ob.curLapTime)
        
            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Exemplo n.º 32
0
def playGame(f_diagnostics,
             train_indicator,
             agent,
             port=3101):  # 1 means Train, 0 means simply Run

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 65  #of sensors input
    env_name = 'Torcs_Env'
    save_location = "./weights/"

    # Generate a Torcs environment
    print("I have been asked to use port: ", port)
    env = TorcsEnv(vision=False, throttle=True, gear_change=False, main=1)
    ob = None
    while ob is None:
        try:
            client = snakeoil3.Client(p=port,
                                      vision=False)  # Open new UDP in vtorcs
            client.MAX_STEPS = np.inf
            client.get_servers_input(0)  # Get the initial input from torcs

            obs = client.S.d  # Get the current full-observation from torcs
            ob = env.make_observation(obs)

            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, \
             ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents))
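            # 65-dim state: angle (1) + track (19) + trackPos (1) + speedX/Y/Z (3)
            # + wheelSpinVel (4) + rpm (1) + opponents (36)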
        except:
            pass

    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False
    epsilon_steady_state = 0.01  # Floor for the exploration noise; epsilon never decays below this

    totalSteps = 0
    best_reward = -100000
    running_avg_reward = 0.

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        save_indicator = 0
        early_stop = 1
        total_reward = 0.
        info = {'termination_cause': 0}
        distance_traversed = 0.
        speed_array = []
        trackPos_array = []

        print('\n\nStarting new episode...\n')
        print("Initial memory consumption: ")

        for step in range(max_steps):

            # Take noisy actions during training
            if (train_indicator == 1):
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, epsilon_steady_state)
                a_t = agent.noise_action(
                    s_t, epsilon)  #Take noisy actions during training

            else:
                a_t = agent.action(s_t)

            try:
                ob, r_t, done, info = env.step(step, client, a_t, early_stop)
                if done:
                    break

                analyse_info(info, printing=False)

                s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, \
                 ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents))
                distance_traversed += ob.speedX * np.cos(
                    ob.angle)  #Assuming 1 step = 1 second

                if (math.isnan(r_t)):
                    r_t = 0.0
                    for bad_r in range(50):
                        print('Bad Reward Found')
                    break  #Introduced by Anirban

            # Add to replay buffer only if training
                if (train_indicator):
                    agent.perceive(s_t, a_t, r_t, s_t1,
                                   done)  # Add experience to replay buffer

            except Exception as e:
                print("Exception caught at port " + str(i) + str(e))
                ob = None
                while ob is None:
                    try:
                        client = snakeoil3.Client(
                            p=port, vision=False)  # Open new UDP in vtorcs
                        client.MAX_STEPS = np.inf
                        client.get_servers_input(
                            0)  # Get the initial input from torcs
                        obs = client.S.d  # Get the current full-observation from torcs
                        ob = env.make_observation(obs)
                    except:
                        pass
                    continue
            total_reward += r_t
            s_t = s_t1

            # Displaying progress every 15 steps.
            if ((np.mod(step, 15) == 0)):
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action",
                      a_t, "Reward", r_t)

            totalSteps += 1
            if done:
                break

        # Saving the best model.
        running_avg_reward = running_average(running_avg_reward, i + 1,
                                             total_reward)
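        # running_average (defined elsewhere) presumably maintains an incremental
        # mean of episode rewards; a sketch is given after this example.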

        if train_indicator == 1:

            #Save network after every 20 episodes and store the data
            if np.mod(i, 20) == 0:
                agent.saveNetwork(i)

        #Saving training data for client for analysis
        if train_indicator == 1 and np.mod(i, 5) == 0:
            f1 = open(str(port) + ".csv", "a+")
            client.printAnalysis(f1, i)
            f1.close()


        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Num_Steps= " + str(step) + "; Max_steps= " \
         + str(max_steps)  +"; Reward= " + str(total_reward) + \
          "; Running average reward= " + str(running_avg_reward))
        print("Total Step: " + str(totalSteps))
        print("")

        print(info)
        try:
            # Both branches relaunch TORCS; the check only logs when another
            # agent forced a hard reset.
            if info.get('termination_cause') == 'hardReset':
                print('Hard reset by some agent')
            ob, client = env.reset(client=client, relaunch=True)
        except Exception as e:
            print("Exception caught at point B at port " + str(i) + str(e))
            ob = None
            while ob is None:
                try:
                    client = snakeoil3.Client(
                        p=port, vision=False)  # Open new UDP in vtorcs
                    client.MAX_STEPS = np.inf
                    client.get_servers_input(
                        0)  # Get the initial input from torcs
                    obs = client.S.d  # Get the current full-observation from torcs
                    ob = env.make_observation(obs)
                except:
                    print("Exception caught at at point C at port " + str(i) +
                          str(e))


        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, \
         ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents))

    env.end()  # This is for shutting down TORCS
    # f1 is opened and closed per save above; no final close is needed here.
    print("Finish.")